{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998661849324234, "eval_steps": 500, "global_step": 3736, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 748.0, "completions/max_terminated_length": 748.0, "completions/mean_length": 447.75, "completions/mean_terminated_length": 447.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.00026763013515321824, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057988099047390175, "kl": 0.00026988983154296875, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 4714.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 349.0, "completions/mean_terminated_length": 349.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.0005352602703064365, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035249041394515156, "kl": 0.00010037422180175781, "learning_rate": 2.67379679144385e-09, "loss": 0.0, "num_tokens": 8566.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 363.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.0008028904054596548, "frac_reward_zero_std": 1.0, "grad_norm": 0.005189577315977487, "kl": 8.511543273925781e-05, "learning_rate": 5.3475935828877e-09, "loss": 0.0, "num_tokens": 12566.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.001070520540612873, "frac_reward_zero_std": 1.0, "grad_norm": 0.006572052866784485, "kl": 0.000164031982421875, "learning_rate": 8.021390374331551e-09, "loss": 0.0, "num_tokens": 15942.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.0013381506757660913, "frac_reward_zero_std": 1.0, "grad_norm": 0.007416422660973104, "kl": 0.0001850128173828125, "learning_rate": 1.06951871657754e-08, "loss": 0.0, "num_tokens": 18754.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.0016057808109193096, "frac_reward_zero_std": 1.0, "grad_norm": 0.00580507028860041, "kl": 0.00013947486877441406, "learning_rate": 1.336898395721925e-08, "loss": 0.0, "num_tokens": 21985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.0018734109460725277, "frac_reward_zero_std": 1.0, "grad_norm": 0.007842026173873974, "kl": 0.00013184547424316406, "learning_rate": 1.6042780748663103e-08, "loss": 0.0, "num_tokens": 25889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.002141041081225746, "frac_reward_zero_std": 0.5, "grad_norm": 1.0918362962903212, "kl": 0.000148773193359375, "learning_rate": 1.8716577540106948e-08, "loss": 0.006, "num_tokens": 29884.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.0024086712163789645, "frac_reward_zero_std": 1.0, "grad_norm": 0.006057577199210279, "kl": 0.00020551681518554688, "learning_rate": 2.13903743315508e-08, "loss": 0.0, "num_tokens": 33798.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.0026763013515321826, "frac_reward_zero_std": 1.0, "grad_norm": 0.007058111005756307, "kl": 0.00017571449279785156, "learning_rate": 2.406417112299465e-08, "loss": 0.0, "num_tokens": 37332.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.0029439314866854006, "frac_reward_zero_std": 0.5, "grad_norm": 1.350883386080629, "kl": 0.000247955322265625, "learning_rate": 2.67379679144385e-08, "loss": -0.0231, "num_tokens": 40872.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 202.625, "completions/mean_terminated_length": 202.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.003211561621838619, "frac_reward_zero_std": 1.0, "grad_norm": 0.005238069661142666, "kl": 9.131431579589844e-05, "learning_rate": 2.941176470588235e-08, "loss": 0.0, "num_tokens": 43497.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.0034791917569918372, "frac_reward_zero_std": 1.0, "grad_norm": 0.005314998939948432, "kl": 0.00010180473327636719, "learning_rate": 3.2085561497326206e-08, "loss": 0.0, "num_tokens": 46275.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.0037468218921450553, "frac_reward_zero_std": 1.0, "grad_norm": 0.009539379270586608, "kl": 0.000194549560546875, "learning_rate": 3.475935828877005e-08, "loss": 0.0, "num_tokens": 49153.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 470.125, "completions/mean_terminated_length": 470.125, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.004014452027298274, "frac_reward_zero_std": 0.5, "grad_norm": 0.5310462410941966, "kl": 0.0001068115234375, "learning_rate": 3.7433155080213896e-08, "loss": 0.0129, "num_tokens": 54010.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 427.75, "completions/mean_terminated_length": 342.5714416503906, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.004282082162451492, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033486207389127904, "kl": 5.543231964111328e-05, "learning_rate": 4.0106951871657754e-08, "loss": 0.0, "num_tokens": 58352.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.00454971229760471, "frac_reward_zero_std": 1.0, "grad_norm": 0.004273730854964133, "kl": 6.794929504394531e-05, "learning_rate": 4.27807486631016e-08, "loss": 0.0, "num_tokens": 61867.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.004817342432757929, "frac_reward_zero_std": 1.0, "grad_norm": 0.004025321282128191, "kl": 0.00014448165893554688, "learning_rate": 4.545454545454545e-08, "loss": 0.0, "num_tokens": 65503.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.005084972567911147, "frac_reward_zero_std": 1.0, "grad_norm": 0.009693117023461064, "kl": 8.785724639892578e-05, "learning_rate": 4.81283422459893e-08, "loss": 0.0, "num_tokens": 68325.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.005352602703064365, "frac_reward_zero_std": 1.0, "grad_norm": 0.005094384956989145, "kl": 0.00014781951904296875, "learning_rate": 5.0802139037433154e-08, "loss": 0.0, "num_tokens": 70977.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 383.25, "completions/mean_terminated_length": 383.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.005620232838217583, "frac_reward_zero_std": 1.0, "grad_norm": 0.007596811484957952, "kl": 0.00018525123596191406, "learning_rate": 5.3475935828877e-08, "loss": 0.0, "num_tokens": 75259.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.005887862973370801, "frac_reward_zero_std": 0.0, "grad_norm": 1.1173882211852215, "kl": 0.0001392364501953125, "learning_rate": 5.614973262032086e-08, "loss": 0.0331, "num_tokens": 79136.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.00615549310852402, "frac_reward_zero_std": 1.0, "grad_norm": 0.00607142284842563, "kl": 0.00011396408081054688, "learning_rate": 5.88235294117647e-08, "loss": 0.0, "num_tokens": 81984.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 368.875, "completions/mean_terminated_length": 368.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.006423123243677238, "frac_reward_zero_std": 1.0, "grad_norm": 0.004089263004054082, "kl": 0.00012445449829101562, "learning_rate": 6.149732620320855e-08, "loss": 0.0, "num_tokens": 86019.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.006690753378830456, "frac_reward_zero_std": 1.0, "grad_norm": 0.05057569120447194, "kl": 0.00039386749267578125, "learning_rate": 6.417112299465241e-08, "loss": 0.0, "num_tokens": 89461.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.0069583835139836745, "frac_reward_zero_std": 1.0, "grad_norm": 0.009196844209555846, "kl": 0.00011968612670898438, "learning_rate": 6.684491978609626e-08, "loss": 0.0, "num_tokens": 92773.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.0072260136491368926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034899479713966196, "kl": 4.9233436584472656e-05, "learning_rate": 6.95187165775401e-08, "loss": 0.0, "num_tokens": 95943.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 689.0, "completions/max_terminated_length": 689.0, "completions/mean_length": 509.875, "completions/mean_terminated_length": 509.875, "completions/min_length": 347.0, "completions/min_terminated_length": 347.0, "epoch": 0.007493643784290111, "frac_reward_zero_std": 0.0, "grad_norm": 1.1027322590218034, "kl": 0.00013256072998046875, "learning_rate": 7.219251336898395e-08, "loss": -0.0402, "num_tokens": 101102.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.00776127391944333, "frac_reward_zero_std": 1.0, "grad_norm": 0.011677146644830818, "kl": 0.0002346038818359375, "learning_rate": 7.486631016042779e-08, "loss": 0.0, "num_tokens": 104676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 293.5, "completions/mean_terminated_length": 293.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.008028904054596548, "frac_reward_zero_std": 1.0, "grad_norm": 0.004562332363022864, "kl": 6.949901580810547e-05, "learning_rate": 7.754010695187166e-08, "loss": 0.0, "num_tokens": 108040.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.008296534189749766, "frac_reward_zero_std": 1.0, "grad_norm": 0.005923679911295556, "kl": 0.00012683868408203125, "learning_rate": 8.021390374331551e-08, "loss": 0.0, "num_tokens": 111837.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.008564164324902984, "frac_reward_zero_std": 1.0, "grad_norm": 0.006456029719591425, "kl": 0.00024318695068359375, "learning_rate": 8.288770053475935e-08, "loss": 0.0, "num_tokens": 115344.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.008831794460056202, "frac_reward_zero_std": 0.5, "grad_norm": 0.5159321604611392, "kl": 0.00012159347534179688, "learning_rate": 8.55614973262032e-08, "loss": 0.0113, "num_tokens": 119224.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.00909942459520942, "frac_reward_zero_std": 1.0, "grad_norm": 0.007452361995490628, "kl": 0.00016260147094726562, "learning_rate": 8.823529411764706e-08, "loss": 0.0, "num_tokens": 123095.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.009367054730362638, "frac_reward_zero_std": 1.0, "grad_norm": 0.005920838218994593, "kl": 0.00017786026000976562, "learning_rate": 9.09090909090909e-08, "loss": 0.0, "num_tokens": 126798.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 423.625, "completions/mean_terminated_length": 423.625, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.009634684865515858, "frac_reward_zero_std": 1.0, "grad_norm": 0.0035974282926108936, "kl": 0.00014209747314453125, "learning_rate": 9.358288770053476e-08, "loss": 0.0, "num_tokens": 131231.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.009902315000669076, "frac_reward_zero_std": 1.0, "grad_norm": 0.00600584616498382, "kl": 0.00018596649169921875, "learning_rate": 9.62566844919786e-08, "loss": 0.0, "num_tokens": 135166.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.010169945135822294, "frac_reward_zero_std": 1.0, "grad_norm": 0.002037699231613498, "kl": 4.398822784423828e-05, "learning_rate": 9.893048128342246e-08, "loss": 0.0, "num_tokens": 138317.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.010437575270975512, "frac_reward_zero_std": 1.0, "grad_norm": 0.005924799759532177, "kl": 0.00019741058349609375, "learning_rate": 1.0160427807486631e-07, "loss": 0.0, "num_tokens": 141193.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.01070520540612873, "frac_reward_zero_std": 1.0, "grad_norm": 0.01566729578024087, "kl": 0.00020933151245117188, "learning_rate": 1.0427807486631015e-07, "loss": 0.0, "num_tokens": 144721.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.010972835541281948, "frac_reward_zero_std": 1.0, "grad_norm": 0.00599672527849537, "kl": 0.000118255615234375, "learning_rate": 1.06951871657754e-07, "loss": 0.0, "num_tokens": 148987.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.011240465676435166, "frac_reward_zero_std": 1.0, "grad_norm": 0.006119486361613268, "kl": 0.00017118453979492188, "learning_rate": 1.0962566844919786e-07, "loss": 0.0, "num_tokens": 152104.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 319.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.011508095811588384, "frac_reward_zero_std": 1.0, "grad_norm": 0.010990697336740636, "kl": 0.0001766681671142578, "learning_rate": 1.1229946524064171e-07, "loss": 0.0, "num_tokens": 155563.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 296.375, "completions/mean_terminated_length": 296.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.011775725946741603, "frac_reward_zero_std": 1.0, "grad_norm": 0.004217914634217612, "kl": 0.0001354217529296875, "learning_rate": 1.1497326203208556e-07, "loss": 0.0, "num_tokens": 158974.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.01204335608189482, "frac_reward_zero_std": 0.5, "grad_norm": 0.9581646885879734, "kl": 0.00019311904907226562, "learning_rate": 1.176470588235294e-07, "loss": -0.0183, "num_tokens": 162878.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.01231098621704804, "frac_reward_zero_std": 1.0, "grad_norm": 0.008595083841009403, "kl": 0.00021839141845703125, "learning_rate": 1.2032085561497325e-07, "loss": 0.0, "num_tokens": 166244.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 401.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.012578616352201259, "frac_reward_zero_std": 1.0, "grad_norm": 0.004123384365508188, "kl": 0.0001227855682373047, "learning_rate": 1.229946524064171e-07, "loss": 0.0, "num_tokens": 170802.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.012846246487354477, "frac_reward_zero_std": 1.0, "grad_norm": 0.006758297566279814, "kl": 0.0001862049102783203, "learning_rate": 1.2566844919786097e-07, "loss": 0.0, "num_tokens": 173966.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.013113876622507695, "frac_reward_zero_std": 1.0, "grad_norm": 0.010581887910201307, "kl": 7.939338684082031e-05, "learning_rate": 1.2834224598930482e-07, "loss": 0.0, "num_tokens": 177064.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.013381506757660913, "frac_reward_zero_std": 1.0, "grad_norm": 0.007017331753674308, "kl": 0.0002231597900390625, "learning_rate": 1.3101604278074866e-07, "loss": 0.0, "num_tokens": 180683.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.01364913689281413, "frac_reward_zero_std": 0.0, "grad_norm": 1.1484330734855057, "kl": 0.00015974044799804688, "learning_rate": 1.3368983957219251e-07, "loss": -0.0183, "num_tokens": 184949.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.013916767027967349, "frac_reward_zero_std": 1.0, "grad_norm": 0.00502680752950606, "kl": 9.59634780883789e-05, "learning_rate": 1.3636363636363635e-07, "loss": 0.0, "num_tokens": 188134.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.014184397163120567, "frac_reward_zero_std": 1.0, "grad_norm": 0.007510514121775174, "kl": 0.00015020370483398438, "learning_rate": 1.390374331550802e-07, "loss": 0.0, "num_tokens": 191487.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 408.125, "completions/mean_terminated_length": 408.125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.014452027298273785, "frac_reward_zero_std": 0.5, "grad_norm": 0.7662558598055371, "kl": 0.0001232624053955078, "learning_rate": 1.4171122994652406e-07, "loss": 0.0138, "num_tokens": 195796.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 434.75, "completions/mean_terminated_length": 434.75, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.014719657433427003, "frac_reward_zero_std": 0.5, "grad_norm": 0.6844730430589362, "kl": 0.00014352798461914062, "learning_rate": 1.443850267379679e-07, "loss": -0.036, "num_tokens": 200474.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 375.375, "completions/mean_terminated_length": 375.375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.014987287568580221, "frac_reward_zero_std": 1.0, "grad_norm": 0.007291770324529525, "kl": 0.00014066696166992188, "learning_rate": 1.4705882352941175e-07, "loss": 0.0, "num_tokens": 204673.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.015254917703733441, "frac_reward_zero_std": 1.0, "grad_norm": 0.004858938791932421, "kl": 0.00017547607421875, "learning_rate": 1.4973262032085558e-07, "loss": 0.0, "num_tokens": 208586.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.01552254783888666, "frac_reward_zero_std": 1.0, "grad_norm": 0.005137936524847391, "kl": 0.0001621246337890625, "learning_rate": 1.5240641711229947e-07, "loss": 0.0, "num_tokens": 212641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 389.625, "completions/mean_terminated_length": 389.625, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.015790177974039876, "frac_reward_zero_std": 1.0, "grad_norm": 0.008563408399094675, "kl": 0.00018548965454101562, "learning_rate": 1.5508021390374333e-07, "loss": 0.0, "num_tokens": 216846.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 417.5, "completions/mean_terminated_length": 330.8571472167969, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.016057808109193095, "frac_reward_zero_std": 0.5, "grad_norm": 0.6017606381872099, "kl": 0.0001227855682373047, "learning_rate": 1.5775401069518716e-07, "loss": -0.1315, "num_tokens": 221502.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.01632543824434631, "frac_reward_zero_std": 0.5, "grad_norm": 0.6734493281427368, "kl": 0.00015807151794433594, "learning_rate": 1.6042780748663102e-07, "loss": 0.0403, "num_tokens": 225049.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.01659306837949953, "frac_reward_zero_std": 0.5, "grad_norm": 0.6952662063662977, "kl": 0.00015854835510253906, "learning_rate": 1.6310160427807487e-07, "loss": 0.0, "num_tokens": 229210.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 307.5, "completions/mean_terminated_length": 307.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.01686069851465275, "frac_reward_zero_std": 1.0, "grad_norm": 0.007165779063017118, "kl": 0.00014901161193847656, "learning_rate": 1.657754010695187e-07, "loss": 0.0, "num_tokens": 232758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.017128328649805968, "frac_reward_zero_std": 1.0, "grad_norm": 0.004685448833744657, "kl": 0.0001227855682373047, "learning_rate": 1.6844919786096256e-07, "loss": 0.0, "num_tokens": 236127.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 357.875, "completions/mean_terminated_length": 357.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.017395958784959187, "frac_reward_zero_std": 1.0, "grad_norm": 0.007551347269057193, "kl": 0.00021457672119140625, "learning_rate": 1.711229946524064e-07, "loss": 0.0, "num_tokens": 240118.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.017663588920112404, "frac_reward_zero_std": 1.0, "grad_norm": 0.004337973047238674, "kl": 8.20159912109375e-05, "learning_rate": 1.7379679144385025e-07, "loss": 0.0, "num_tokens": 242792.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 492.375, "completions/mean_terminated_length": 492.375, "completions/min_length": 396.0, "completions/min_terminated_length": 396.0, "epoch": 0.017931219055265624, "frac_reward_zero_std": 0.5, "grad_norm": 0.7918232630169955, "kl": 0.000274658203125, "learning_rate": 1.764705882352941e-07, "loss": -0.0033, "num_tokens": 247875.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.01819884919041884, "frac_reward_zero_std": 1.0, "grad_norm": 0.0074647379964558795, "kl": 0.00020837783813476562, "learning_rate": 1.7914438502673794e-07, "loss": 0.0, "num_tokens": 251250.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 336.25, "completions/mean_terminated_length": 336.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.01846647932557206, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058254675864168205, "kl": 0.00012993812561035156, "learning_rate": 1.818181818181818e-07, "loss": 0.0, "num_tokens": 255048.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.018734109460725276, "frac_reward_zero_std": 1.0, "grad_norm": 0.007169126768826184, "kl": 0.00020122528076171875, "learning_rate": 1.8449197860962566e-07, "loss": 0.0, "num_tokens": 258779.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.019001739595878496, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061526767309464095, "kl": 0.0001678466796875, "learning_rate": 1.8716577540106952e-07, "loss": 0.0, "num_tokens": 262094.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.019269369731031716, "frac_reward_zero_std": 1.0, "grad_norm": 0.006268847738439298, "kl": 0.000232696533203125, "learning_rate": 1.8983957219251338e-07, "loss": 0.0, "num_tokens": 265524.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.019536999866184932, "frac_reward_zero_std": 0.5, "grad_norm": 0.9358202399602956, "kl": 0.0003223419189453125, "learning_rate": 1.925133689839572e-07, "loss": 0.0, "num_tokens": 268821.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.019804630001338152, "frac_reward_zero_std": 1.0, "grad_norm": 0.005580415755397765, "kl": 8.749961853027344e-05, "learning_rate": 1.9518716577540107e-07, "loss": 0.0, "num_tokens": 272512.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 498.375, "completions/mean_terminated_length": 498.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.02007226013649137, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037578193042271265, "kl": 8.0108642578125e-05, "learning_rate": 1.9786096256684493e-07, "loss": 0.0, "num_tokens": 277591.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.020339890271644588, "frac_reward_zero_std": 0.5, "grad_norm": 0.5595256400995273, "kl": 0.0001016855239868164, "learning_rate": 2.0053475935828876e-07, "loss": -0.0198, "num_tokens": 281288.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 370.125, "completions/mean_terminated_length": 370.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.020607520406797804, "frac_reward_zero_std": 1.0, "grad_norm": 0.008720671645661806, "kl": 7.843971252441406e-05, "learning_rate": 2.0320855614973262e-07, "loss": 0.0, "num_tokens": 285297.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 308.75, "completions/mean_terminated_length": 308.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.020875150541951024, "frac_reward_zero_std": 1.0, "grad_norm": 0.00513715525316272, "kl": 0.00010466575622558594, "learning_rate": 2.0588235294117645e-07, "loss": 0.0, "num_tokens": 288755.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 216.0, "completions/mean_terminated_length": 216.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.02114278067710424, "frac_reward_zero_std": 1.0, "grad_norm": 0.006263995893753492, "kl": 0.00010514259338378906, "learning_rate": 2.085561497326203e-07, "loss": 0.0, "num_tokens": 291399.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.02141041081225746, "frac_reward_zero_std": 0.5, "grad_norm": 1.1465301197785622, "kl": 0.00021266937255859375, "learning_rate": 2.1122994652406416e-07, "loss": -0.0214, "num_tokens": 294630.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.021678040947410677, "frac_reward_zero_std": 1.0, "grad_norm": 0.00512435719984408, "kl": 0.00015306472778320312, "learning_rate": 2.13903743315508e-07, "loss": 0.0, "num_tokens": 297682.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.021945671082563897, "frac_reward_zero_std": 1.0, "grad_norm": 0.008883753965481992, "kl": 0.0002651214599609375, "learning_rate": 2.1657754010695188e-07, "loss": 0.0, "num_tokens": 300759.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 391.75, "completions/mean_terminated_length": 391.75, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.022213301217717116, "frac_reward_zero_std": 0.5, "grad_norm": 0.8359185887618569, "kl": 0.00013971328735351562, "learning_rate": 2.192513368983957e-07, "loss": 0.01, "num_tokens": 304945.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 235.0, "completions/max_terminated_length": 235.0, "completions/mean_length": 197.875, "completions/mean_terminated_length": 197.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.022480931352870333, "frac_reward_zero_std": 1.0, "grad_norm": 0.009864560076557427, "kl": 0.00029277801513671875, "learning_rate": 2.2192513368983957e-07, "loss": 0.0, "num_tokens": 307444.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.022748561488023553, "frac_reward_zero_std": 1.0, "grad_norm": 0.006199012460600326, "kl": 0.00021457672119140625, "learning_rate": 2.2459893048128343e-07, "loss": 0.0, "num_tokens": 310659.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 409.375, "completions/mean_terminated_length": 409.375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.02301619162317677, "frac_reward_zero_std": 1.0, "grad_norm": 0.005765563854636453, "kl": 0.00015664100646972656, "learning_rate": 2.2727272727272726e-07, "loss": 0.0, "num_tokens": 315066.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 350.75, "completions/mean_terminated_length": 350.75, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.02328382175832999, "frac_reward_zero_std": 1.0, "grad_norm": 0.005193071634394941, "kl": 6.249547004699707e-05, "learning_rate": 2.2994652406417112e-07, "loss": 0.0, "num_tokens": 319036.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.023551451893483205, "frac_reward_zero_std": 1.0, "grad_norm": 0.007150294688273708, "kl": 0.00012159347534179688, "learning_rate": 2.3262032085561498e-07, "loss": 0.0, "num_tokens": 322203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 395.125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.023819082028636425, "frac_reward_zero_std": 0.5, "grad_norm": 0.8686582408957065, "kl": 0.0001742839813232422, "learning_rate": 2.352941176470588e-07, "loss": 0.0, "num_tokens": 326684.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.02408671216378964, "frac_reward_zero_std": 1.0, "grad_norm": 0.004940357671715304, "kl": 0.00010323524475097656, "learning_rate": 2.3796791443850267e-07, "loss": 0.0, "num_tokens": 330151.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.02435434229894286, "frac_reward_zero_std": 1.0, "grad_norm": 0.005265176279940162, "kl": 6.830692291259766e-05, "learning_rate": 2.406417112299465e-07, "loss": 0.0, "num_tokens": 333203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 616.75, "completions/mean_terminated_length": 481.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.02462197243409608, "frac_reward_zero_std": 1.0, "grad_norm": 0.004825469225958429, "kl": 0.00016069412231445312, "learning_rate": 2.433155080213904e-07, "loss": 0.0, "num_tokens": 339409.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.024889602569249297, "frac_reward_zero_std": 1.0, "grad_norm": 0.00793342408915161, "kl": 0.00020933151245117188, "learning_rate": 2.459893048128342e-07, "loss": 0.0, "num_tokens": 342318.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.025157232704402517, "frac_reward_zero_std": 1.0, "grad_norm": 0.004995933243426946, "kl": 0.00014448165893554688, "learning_rate": 2.4866310160427805e-07, "loss": 0.0, "num_tokens": 345255.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.025424862839555733, "frac_reward_zero_std": 1.0, "grad_norm": 0.006843193239450456, "kl": 0.00016307830810546875, "learning_rate": 2.5133689839572193e-07, "loss": 0.0, "num_tokens": 347937.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 237.875, "completions/mean_terminated_length": 237.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.025692492974708953, "frac_reward_zero_std": 1.0, "grad_norm": 0.004290552175079954, "kl": 0.00010204315185546875, "learning_rate": 2.5401069518716576e-07, "loss": 0.0, "num_tokens": 350732.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.02596012310986217, "frac_reward_zero_std": 1.0, "grad_norm": 0.00690868232134881, "kl": 0.0001544952392578125, "learning_rate": 2.5668449197860965e-07, "loss": 0.0, "num_tokens": 354191.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.02622775324501539, "frac_reward_zero_std": 1.0, "grad_norm": 0.005625083094261301, "kl": 0.0001678466796875, "learning_rate": 2.593582887700534e-07, "loss": 0.0, "num_tokens": 357504.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.026495383380168606, "frac_reward_zero_std": 0.5, "grad_norm": 1.2789551687545289, "kl": 0.00013017654418945312, "learning_rate": 2.620320855614973e-07, "loss": -0.104, "num_tokens": 360790.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.026763013515321826, "frac_reward_zero_std": 1.0, "grad_norm": 0.011570532167575313, "kl": 0.00017642974853515625, "learning_rate": 2.6470588235294114e-07, "loss": 0.0, "num_tokens": 364451.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 394.875, "completions/mean_terminated_length": 394.875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.027030643650475042, "frac_reward_zero_std": 1.0, "grad_norm": 0.004317908543882397, "kl": 0.00011157989501953125, "learning_rate": 2.6737967914438503e-07, "loss": 0.0, "num_tokens": 368746.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 299.375, "completions/mean_terminated_length": 299.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.02729827378562826, "frac_reward_zero_std": 0.5, "grad_norm": 0.6100202325712061, "kl": 0.00011515617370605469, "learning_rate": 2.700534759358289e-07, "loss": 0.0415, "num_tokens": 372209.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 217.375, "completions/mean_terminated_length": 217.375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.02756590392078148, "frac_reward_zero_std": 1.0, "grad_norm": 0.007203984158364468, "kl": 0.00012683868408203125, "learning_rate": 2.727272727272727e-07, "loss": 0.0, "num_tokens": 374992.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.027833534055934698, "frac_reward_zero_std": 0.5, "grad_norm": 0.6974970130102688, "kl": 0.00013947486877441406, "learning_rate": 2.754010695187166e-07, "loss": 0.0, "num_tokens": 378456.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.028101164191087918, "frac_reward_zero_std": 1.0, "grad_norm": 0.003997827041777393, "kl": 6.520748138427734e-05, "learning_rate": 2.780748663101604e-07, "loss": 0.0, "num_tokens": 381515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.028368794326241134, "frac_reward_zero_std": 1.0, "grad_norm": 0.003719845415004631, "kl": 0.00014638900756835938, "learning_rate": 2.807486631016043e-07, "loss": 0.0, "num_tokens": 385091.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.028636424461394354, "frac_reward_zero_std": 1.0, "grad_norm": 0.004490336410161325, "kl": 0.00010752677917480469, "learning_rate": 2.834224598930481e-07, "loss": 0.0, "num_tokens": 388877.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.02890405459654757, "frac_reward_zero_std": 1.0, "grad_norm": 0.009412930264001626, "kl": 0.00013446807861328125, "learning_rate": 2.8609625668449196e-07, "loss": 0.0, "num_tokens": 392003.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.02917168473170079, "frac_reward_zero_std": 0.5, "grad_norm": 0.7779188980062675, "kl": 7.939338684082031e-05, "learning_rate": 2.887700534759358e-07, "loss": 0.0004, "num_tokens": 395284.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.029439314866854006, "frac_reward_zero_std": 1.0, "grad_norm": 0.005080185690990139, "kl": 0.0001850128173828125, "learning_rate": 2.9144385026737967e-07, "loss": 0.0, "num_tokens": 399406.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.029706945002007226, "frac_reward_zero_std": 0.5, "grad_norm": 0.6908833345633908, "kl": 0.0002918243408203125, "learning_rate": 2.941176470588235e-07, "loss": -0.0265, "num_tokens": 403327.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.029974575137160443, "frac_reward_zero_std": 0.5, "grad_norm": 1.037017876927161, "kl": 0.00021600723266601562, "learning_rate": 2.967914438502674e-07, "loss": 0.0271, "num_tokens": 406597.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.030242205272313662, "frac_reward_zero_std": 1.0, "grad_norm": 0.005513031669923925, "kl": 0.000152587890625, "learning_rate": 2.9946524064171117e-07, "loss": 0.0, "num_tokens": 409439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.030509835407466882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041045257638919926, "kl": 9.644031524658203e-05, "learning_rate": 3.0213903743315505e-07, "loss": 0.0, "num_tokens": 412698.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 345.25, "completions/mean_terminated_length": 345.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.0307774655426201, "frac_reward_zero_std": 1.0, "grad_norm": 0.003923137884736292, "kl": 0.00010395050048828125, "learning_rate": 3.0481283422459894e-07, "loss": 0.0, "num_tokens": 416656.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.03104509567777332, "frac_reward_zero_std": 0.5, "grad_norm": 0.8758754189718172, "kl": 0.00028228759765625, "learning_rate": 3.0748663101604277e-07, "loss": 0.0, "num_tokens": 420416.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.03131272581292654, "frac_reward_zero_std": 1.0, "grad_norm": 0.006879435300329998, "kl": 0.00010037422180175781, "learning_rate": 3.1016042780748665e-07, "loss": 0.0, "num_tokens": 423499.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 241.125, "completions/mean_terminated_length": 241.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.03158035594807975, "frac_reward_zero_std": 1.0, "grad_norm": 0.007821409870719432, "kl": 0.00010442733764648438, "learning_rate": 3.1283422459893043e-07, "loss": 0.0, "num_tokens": 426556.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.03184798608323297, "frac_reward_zero_std": 0.5, "grad_norm": 0.5244161627763372, "kl": 0.00013399124145507812, "learning_rate": 3.155080213903743e-07, "loss": 0.0217, "num_tokens": 430890.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 391.0, "completions/mean_terminated_length": 391.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.03211561621838619, "frac_reward_zero_std": 0.5, "grad_norm": 0.7450879137794851, "kl": 0.00023794174194335938, "learning_rate": 3.1818181818181815e-07, "loss": -0.0061, "num_tokens": 435126.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 362.875, "completions/mean_terminated_length": 362.875, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.03238324635353941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054827809321797445, "kl": 0.00015497207641601562, "learning_rate": 3.2085561497326203e-07, "loss": 0.0, "num_tokens": 439153.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.03265087648869262, "frac_reward_zero_std": 1.0, "grad_norm": 0.00772435224506228, "kl": 0.00020885467529296875, "learning_rate": 3.2352941176470586e-07, "loss": 0.0, "num_tokens": 442555.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.03291850662384584, "frac_reward_zero_std": 1.0, "grad_norm": 0.007057108980632367, "kl": 0.00024890899658203125, "learning_rate": 3.2620320855614975e-07, "loss": 0.0, "num_tokens": 445608.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.03318613675899906, "frac_reward_zero_std": 0.5, "grad_norm": 1.028650665731286, "kl": 0.00011110305786132812, "learning_rate": 3.2887700534759353e-07, "loss": -0.017, "num_tokens": 448302.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.03345376689415228, "frac_reward_zero_std": 0.0, "grad_norm": 1.1438051189456147, "kl": 0.00010752677917480469, "learning_rate": 3.315508021390374e-07, "loss": 0.0911, "num_tokens": 452488.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.0337213970293055, "frac_reward_zero_std": 1.0, "grad_norm": 0.007670747163318072, "kl": 0.0002193450927734375, "learning_rate": 3.342245989304813e-07, "loss": 0.0, "num_tokens": 455825.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.033989027164458716, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075252196941216045, "kl": 0.00020313262939453125, "learning_rate": 3.3689839572192513e-07, "loss": 0.0, "num_tokens": 459797.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.034256657299611935, "frac_reward_zero_std": 1.0, "grad_norm": 1.5259957901722103, "kl": 0.0010712146759033203, "learning_rate": 3.39572192513369e-07, "loss": 0.0, "num_tokens": 463322.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.034524287434765155, "frac_reward_zero_std": 1.0, "grad_norm": 0.005176836762405492, "kl": 0.00013446807861328125, "learning_rate": 3.422459893048128e-07, "loss": 0.0, "num_tokens": 466990.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.034791917569918375, "frac_reward_zero_std": 1.0, "grad_norm": 0.007800667259715898, "kl": 0.00023317337036132812, "learning_rate": 3.449197860962567e-07, "loss": 0.0, "num_tokens": 470377.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.03505954770507159, "frac_reward_zero_std": 1.0, "grad_norm": 0.00633767438288052, "kl": 0.0001277923583984375, "learning_rate": 3.475935828877005e-07, "loss": 0.0, "num_tokens": 473607.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.03532717784022481, "frac_reward_zero_std": 0.5, "grad_norm": 0.816708427127644, "kl": 0.00013685226440429688, "learning_rate": 3.502673796791444e-07, "loss": 0.0511, "num_tokens": 477558.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.03559480797537803, "frac_reward_zero_std": 1.0, "grad_norm": 0.0071521273974093965, "kl": 0.00017833709716796875, "learning_rate": 3.529411764705882e-07, "loss": 0.0, "num_tokens": 481480.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.03586243811053125, "frac_reward_zero_std": 0.5, "grad_norm": 0.890138783588357, "kl": 0.0001583099365234375, "learning_rate": 3.5561497326203206e-07, "loss": 0.0647, "num_tokens": 484943.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.03613006824568447, "frac_reward_zero_std": 1.0, "grad_norm": 0.0341049503813328, "kl": 0.0003428459167480469, "learning_rate": 3.582887700534759e-07, "loss": 0.0, "num_tokens": 487852.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.03639769838083768, "frac_reward_zero_std": 1.0, "grad_norm": 0.005846010717995214, "kl": 0.00013017654418945312, "learning_rate": 3.609625668449198e-07, "loss": 0.0, "num_tokens": 491382.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.0366653285159909, "frac_reward_zero_std": 1.0, "grad_norm": 0.0028869224730762714, "kl": 0.00010800361633300781, "learning_rate": 3.636363636363636e-07, "loss": 0.0, "num_tokens": 495004.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.03693295865114412, "frac_reward_zero_std": 0.5, "grad_norm": 0.7582559349674683, "kl": 0.00016427040100097656, "learning_rate": 3.663101604278075e-07, "loss": -0.0186, "num_tokens": 498318.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.03720058878629734, "frac_reward_zero_std": 1.0, "grad_norm": 0.004740410681663473, "kl": 0.00015115737915039062, "learning_rate": 3.689839572192513e-07, "loss": 0.0, "num_tokens": 502182.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 363.25, "completions/mean_terminated_length": 363.25, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.03746821892145055, "frac_reward_zero_std": 1.0, "grad_norm": 0.006388483498891861, "kl": 0.0001544952392578125, "learning_rate": 3.7165775401069515e-07, "loss": 0.0, "num_tokens": 506056.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.03773584905660377, "frac_reward_zero_std": 1.0, "grad_norm": 0.004831378335668665, "kl": 8.219480514526367e-05, "learning_rate": 3.7433155080213904e-07, "loss": 0.0, "num_tokens": 509138.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.03800347919175699, "frac_reward_zero_std": 1.0, "grad_norm": 0.00723894527626242, "kl": 0.00010895729064941406, "learning_rate": 3.7700534759358287e-07, "loss": 0.0, "num_tokens": 512304.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 348.125, "completions/mean_terminated_length": 348.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.03827110932691021, "frac_reward_zero_std": 1.0, "grad_norm": 0.00633610941022107, "kl": 0.00013828277587890625, "learning_rate": 3.7967914438502675e-07, "loss": 0.0, "num_tokens": 516137.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.03853873946206343, "frac_reward_zero_std": 0.5, "grad_norm": 0.9086707820103591, "kl": 0.00016546249389648438, "learning_rate": 3.8235294117647053e-07, "loss": -0.0794, "num_tokens": 519415.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 390.875, "completions/mean_terminated_length": 390.875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.038806369597216644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0048947916151769345, "kl": 0.00015163421630859375, "learning_rate": 3.850267379679144e-07, "loss": 0.0, "num_tokens": 523506.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.039073999732369864, "frac_reward_zero_std": 1.0, "grad_norm": 0.004107375129999557, "kl": 9.822845458984375e-05, "learning_rate": 3.8770053475935825e-07, "loss": 0.0, "num_tokens": 526487.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.039341629867523084, "frac_reward_zero_std": 1.0, "grad_norm": 0.004215765038854095, "kl": 0.0001323223114013672, "learning_rate": 3.9037433155080213e-07, "loss": 0.0, "num_tokens": 529892.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.039609260002676304, "frac_reward_zero_std": 0.5, "grad_norm": 4.949558090569761, "kl": 0.00022411346435546875, "learning_rate": 3.9304812834224597e-07, "loss": -0.0448, "num_tokens": 532876.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.03987689013782952, "frac_reward_zero_std": 1.0, "grad_norm": 0.005403736831930975, "kl": 0.00011396408081054688, "learning_rate": 3.9572192513368985e-07, "loss": 0.0, "num_tokens": 536406.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.04014452027298274, "frac_reward_zero_std": 1.0, "grad_norm": 0.02215202652734927, "kl": 0.00024199485778808594, "learning_rate": 3.9839572192513363e-07, "loss": 0.0, "num_tokens": 539181.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.040412150408135956, "frac_reward_zero_std": 1.0, "grad_norm": 0.006768704417391081, "kl": 0.0001926422119140625, "learning_rate": 4.010695187165775e-07, "loss": 0.0, "num_tokens": 542726.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.040679780543289176, "frac_reward_zero_std": 0.5, "grad_norm": 0.9697986930467597, "kl": 0.000171661376953125, "learning_rate": 4.037433155080214e-07, "loss": 0.0077, "num_tokens": 547065.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.040947410678442396, "frac_reward_zero_std": 1.0, "grad_norm": 0.011818142214701278, "kl": 0.00017547607421875, "learning_rate": 4.0641711229946523e-07, "loss": 0.0, "num_tokens": 550190.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.04121504081359561, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068964649174858535, "kl": 0.00021028518676757812, "learning_rate": 4.090909090909091e-07, "loss": 0.0, "num_tokens": 554475.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.04148267094874883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0098227567824666, "kl": 0.0002727508544921875, "learning_rate": 4.117647058823529e-07, "loss": 0.0, "num_tokens": 557391.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.04175030108390205, "frac_reward_zero_std": 0.5, "grad_norm": 0.9047565958225845, "kl": 0.00019693374633789062, "learning_rate": 4.144385026737968e-07, "loss": 0.0061, "num_tokens": 561773.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.04201793121905527, "frac_reward_zero_std": 1.0, "grad_norm": 0.007199467207812221, "kl": 0.00010609626770019531, "learning_rate": 4.171122994652406e-07, "loss": 0.0, "num_tokens": 564845.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 316.375, "completions/mean_terminated_length": 316.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.04228556135420848, "frac_reward_zero_std": 1.0, "grad_norm": 0.009709309278572626, "kl": 0.0002033710479736328, "learning_rate": 4.197860962566845e-07, "loss": 0.0, "num_tokens": 568360.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.0425531914893617, "frac_reward_zero_std": 0.5, "grad_norm": 0.9047044197562589, "kl": 0.0001735687255859375, "learning_rate": 4.2245989304812833e-07, "loss": -0.0041, "num_tokens": 572411.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.04282082162451492, "frac_reward_zero_std": 1.0, "grad_norm": 0.00443813296334193, "kl": 0.00010156631469726562, "learning_rate": 4.2513368983957216e-07, "loss": 0.0, "num_tokens": 575655.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.04308845175966814, "frac_reward_zero_std": 0.5, "grad_norm": 0.7808692981997745, "kl": 0.0002117156982421875, "learning_rate": 4.27807486631016e-07, "loss": -0.0803, "num_tokens": 579543.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.043356081894821354, "frac_reward_zero_std": 1.0, "grad_norm": 0.007054676872048974, "kl": 0.0001983642578125, "learning_rate": 4.304812834224599e-07, "loss": 0.0, "num_tokens": 582900.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.04362371202997457, "frac_reward_zero_std": 1.0, "grad_norm": 0.009326995786631825, "kl": 0.00011086463928222656, "learning_rate": 4.3315508021390376e-07, "loss": 0.0, "num_tokens": 585483.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 408.125, "completions/mean_terminated_length": 408.125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.04389134216512779, "frac_reward_zero_std": 1.0, "grad_norm": 0.01038421995533856, "kl": 0.00015592575073242188, "learning_rate": 4.358288770053476e-07, "loss": 0.0, "num_tokens": 589872.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.04415897230028101, "frac_reward_zero_std": 0.5, "grad_norm": 0.83549680202736, "kl": 0.00019121170043945312, "learning_rate": 4.385026737967914e-07, "loss": 0.0062, "num_tokens": 593791.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.04442660243543423, "frac_reward_zero_std": 1.0, "grad_norm": 0.006632818626137147, "kl": 0.00014019012451171875, "learning_rate": 4.4117647058823526e-07, "loss": 0.0, "num_tokens": 597845.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 400.625, "completions/mean_terminated_length": 400.625, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.044694232570587446, "frac_reward_zero_std": 1.0, "grad_norm": 0.006337557994832609, "kl": 0.00016260147094726562, "learning_rate": 4.4385026737967914e-07, "loss": 0.0, "num_tokens": 602214.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 450.0, "completions/mean_terminated_length": 450.0, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 0.044961862705740666, "frac_reward_zero_std": 1.0, "grad_norm": 0.006366393287399143, "kl": 0.00015401840209960938, "learning_rate": 4.4652406417112297e-07, "loss": 0.0, "num_tokens": 606890.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.045229492840893885, "frac_reward_zero_std": 1.0, "grad_norm": 0.005227526795631353, "kl": 0.00010609626770019531, "learning_rate": 4.4919786096256686e-07, "loss": 0.0, "num_tokens": 610246.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 262.25, "completions/mean_terminated_length": 262.25, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.045497122976047105, "frac_reward_zero_std": 1.0, "grad_norm": 0.020066924858012328, "kl": 0.00021886825561523438, "learning_rate": 4.5187165775401064e-07, "loss": 0.0, "num_tokens": 613420.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.04576475311120032, "frac_reward_zero_std": 1.0, "grad_norm": 0.02028176745239532, "kl": 0.00026035308837890625, "learning_rate": 4.545454545454545e-07, "loss": 0.0, "num_tokens": 616087.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.04603238324635354, "frac_reward_zero_std": 1.0, "grad_norm": 0.005964145853471373, "kl": 0.00018310546875, "learning_rate": 4.5721925133689835e-07, "loss": 0.0, "num_tokens": 619393.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 424.0, "completions/mean_terminated_length": 424.0, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.04630001338150676, "frac_reward_zero_std": 1.0, "grad_norm": 0.005939417909409115, "kl": 0.00023698806762695312, "learning_rate": 4.5989304812834224e-07, "loss": 0.0, "num_tokens": 623965.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.04656764351665998, "frac_reward_zero_std": 0.5, "grad_norm": 0.7757402458652128, "kl": 0.00022220611572265625, "learning_rate": 4.6256684491978607e-07, "loss": 0.0089, "num_tokens": 627686.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.0468352736518132, "frac_reward_zero_std": 1.0, "grad_norm": 0.004868899596288153, "kl": 7.843971252441406e-05, "learning_rate": 4.6524064171122995e-07, "loss": 0.0, "num_tokens": 631207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 392.875, "completions/mean_terminated_length": 392.875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.04710290378696641, "frac_reward_zero_std": 0.5, "grad_norm": 0.6504748035543542, "kl": 0.00012826919555664062, "learning_rate": 4.679144385026738e-07, "loss": 0.0, "num_tokens": 635394.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.04737053392211963, "frac_reward_zero_std": 0.5, "grad_norm": 0.9453404068152775, "kl": 0.0001735687255859375, "learning_rate": 4.705882352941176e-07, "loss": -0.0381, "num_tokens": 638943.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.04763816405727285, "frac_reward_zero_std": 1.0, "grad_norm": 0.007663857546893542, "kl": 0.0001049041748046875, "learning_rate": 4.732620320855615e-07, "loss": 0.0, "num_tokens": 642970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 305.0, "completions/mean_terminated_length": 305.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.04790579419242607, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036570534377656444, "kl": 8.606910705566406e-05, "learning_rate": 4.7593582887700533e-07, "loss": 0.0, "num_tokens": 646402.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.04817342432757928, "frac_reward_zero_std": 1.0, "grad_norm": 0.006078697656987674, "kl": 0.0001773834228515625, "learning_rate": 4.786096256684492e-07, "loss": 0.0, "num_tokens": 650599.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.0484410544627325, "frac_reward_zero_std": 0.5, "grad_norm": 0.6895246084084545, "kl": 0.00032806396484375, "learning_rate": 4.81283422459893e-07, "loss": 0.0006, "num_tokens": 654332.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.04870868459788572, "frac_reward_zero_std": 1.0, "grad_norm": 0.007641387881218661, "kl": 0.00017452239990234375, "learning_rate": 4.839572192513369e-07, "loss": 0.0, "num_tokens": 657425.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.04897631473303894, "frac_reward_zero_std": 1.0, "grad_norm": 0.005088354465565702, "kl": 8.940696716308594e-05, "learning_rate": 4.866310160427808e-07, "loss": 0.0, "num_tokens": 660429.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.04924394486819216, "frac_reward_zero_std": 1.0, "grad_norm": 0.011206530387947753, "kl": 0.0002570152282714844, "learning_rate": 4.893048128342245e-07, "loss": 0.0, "num_tokens": 663216.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.049511575003345375, "frac_reward_zero_std": 1.0, "grad_norm": 0.007483493361906662, "kl": 0.00013589859008789062, "learning_rate": 4.919786096256684e-07, "loss": 0.0, "num_tokens": 666085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 345.875, "completions/mean_terminated_length": 345.875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.049779205138498595, "frac_reward_zero_std": 1.0, "grad_norm": 0.00530147445063562, "kl": 0.0001163482666015625, "learning_rate": 4.946524064171122e-07, "loss": 0.0, "num_tokens": 670076.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 371.625, "completions/mean_terminated_length": 371.625, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.050046835273651814, "frac_reward_zero_std": 1.0, "grad_norm": 0.008962352607476931, "kl": 0.0001888275146484375, "learning_rate": 4.973262032085561e-07, "loss": 0.0, "num_tokens": 674085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 447.25, "completions/mean_terminated_length": 447.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.050314465408805034, "frac_reward_zero_std": 0.5, "grad_norm": 0.4362603087807011, "kl": 0.00013637542724609375, "learning_rate": 5e-07, "loss": -0.0971, "num_tokens": 678803.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.05058209554395825, "frac_reward_zero_std": 1.0, "grad_norm": 0.005593736627119097, "kl": 0.00015115737915039062, "learning_rate": 5.026737967914439e-07, "loss": 0.0, "num_tokens": 681830.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.05084972567911147, "frac_reward_zero_std": 1.0, "grad_norm": 0.004431137522100464, "kl": 0.00012302398681640625, "learning_rate": 5.053475935828877e-07, "loss": 0.0, "num_tokens": 684689.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.05111735581426469, "frac_reward_zero_std": 1.0, "grad_norm": 0.015531992072548712, "kl": 0.00019311904907226562, "learning_rate": 5.080213903743315e-07, "loss": 0.0, "num_tokens": 687392.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.051384985949417906, "frac_reward_zero_std": 0.5, "grad_norm": 0.5949354528055708, "kl": 0.00012922286987304688, "learning_rate": 5.106951871657754e-07, "loss": 0.0258, "num_tokens": 690788.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 384.75, "completions/mean_terminated_length": 384.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.05165261608457112, "frac_reward_zero_std": 1.0, "grad_norm": 0.00487520356695263, "kl": 0.00011849403381347656, "learning_rate": 5.133689839572193e-07, "loss": 0.0, "num_tokens": 694894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.05192024621972434, "frac_reward_zero_std": 1.0, "grad_norm": 0.004491229063532631, "kl": 8.678436279296875e-05, "learning_rate": 5.160427807486631e-07, "loss": 0.0, "num_tokens": 697963.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.05218787635487756, "frac_reward_zero_std": 0.5, "grad_norm": 0.7459110514835956, "kl": 7.915496826171875e-05, "learning_rate": 5.187165775401069e-07, "loss": -0.0153, "num_tokens": 701326.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.05245550649003078, "frac_reward_zero_std": 1.0, "grad_norm": 0.0104315175068788, "kl": 0.0001068115234375, "learning_rate": 5.213903743315507e-07, "loss": 0.0, "num_tokens": 704530.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 373.25, "completions/mean_terminated_length": 373.25, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.052723136625184, "frac_reward_zero_std": 1.0, "grad_norm": 0.004002787580786172, "kl": 9.107589721679688e-05, "learning_rate": 5.240641711229946e-07, "loss": 0.0, "num_tokens": 708620.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.05299076676033721, "frac_reward_zero_std": 1.0, "grad_norm": 0.014952923684259075, "kl": 0.00012493133544921875, "learning_rate": 5.267379679144385e-07, "loss": 0.0, "num_tokens": 711579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 316.875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.05325839689549043, "frac_reward_zero_std": 1.0, "grad_norm": 0.004058332673947101, "kl": 8.368492126464844e-05, "learning_rate": 5.294117647058823e-07, "loss": 0.0, "num_tokens": 715082.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.05352602703064365, "frac_reward_zero_std": 1.0, "grad_norm": 0.01008105193429187, "kl": 0.00025177001953125, "learning_rate": 5.320855614973262e-07, "loss": 0.0, "num_tokens": 718975.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 359.5, "completions/mean_terminated_length": 359.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.05379365716579687, "frac_reward_zero_std": 1.0, "grad_norm": 0.003943606370951632, "kl": 0.00010716915130615234, "learning_rate": 5.347593582887701e-07, "loss": 0.0, "num_tokens": 722883.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 486.625, "completions/mean_terminated_length": 486.625, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.054061287300950084, "frac_reward_zero_std": 1.0, "grad_norm": 0.012891998896361393, "kl": 0.0002474784851074219, "learning_rate": 5.374331550802139e-07, "loss": 0.0, "num_tokens": 728020.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.054328917436103304, "frac_reward_zero_std": 1.0, "grad_norm": 0.004665746580532527, "kl": 0.000141143798828125, "learning_rate": 5.401069518716578e-07, "loss": 0.0, "num_tokens": 730971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.05459654757125652, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038602200878986364, "kl": 3.7163496017456055e-05, "learning_rate": 5.427807486631015e-07, "loss": 0.0, "num_tokens": 734181.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.05486417770640974, "frac_reward_zero_std": 1.0, "grad_norm": 0.004434475817457214, "kl": 0.000171661376953125, "learning_rate": 5.454545454545454e-07, "loss": 0.0, "num_tokens": 738271.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.05513180784156296, "frac_reward_zero_std": 1.0, "grad_norm": 0.006265830441284333, "kl": 0.00018644332885742188, "learning_rate": 5.481283422459893e-07, "loss": 0.0, "num_tokens": 741480.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.055399437976716176, "frac_reward_zero_std": 1.0, "grad_norm": 0.004669805592768584, "kl": 3.9458274841308594e-05, "learning_rate": 5.508021390374332e-07, "loss": 0.0, "num_tokens": 744389.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 209.125, "completions/mean_terminated_length": 209.125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.055667068111869396, "frac_reward_zero_std": 1.0, "grad_norm": 0.006012891553055911, "kl": 0.00012636184692382812, "learning_rate": 5.534759358288769e-07, "loss": 0.0, "num_tokens": 747046.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 215.625, "completions/mean_terminated_length": 215.625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.055934698247022616, "frac_reward_zero_std": 1.0, "grad_norm": 0.005824708442866992, "kl": 0.00014925003051757812, "learning_rate": 5.561497326203208e-07, "loss": 0.0, "num_tokens": 749867.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.056202328382175835, "frac_reward_zero_std": 1.0, "grad_norm": 0.010956142325778734, "kl": 0.00022125244140625, "learning_rate": 5.588235294117647e-07, "loss": 0.0, "num_tokens": 753007.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.05646995851732905, "frac_reward_zero_std": 1.0, "grad_norm": 0.008765099278123297, "kl": 0.00016546249389648438, "learning_rate": 5.614973262032086e-07, "loss": 0.0, "num_tokens": 756203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 335.375, "completions/mean_terminated_length": 335.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.05673758865248227, "frac_reward_zero_std": 1.0, "grad_norm": 0.006930294398256791, "kl": 0.0001518726348876953, "learning_rate": 5.641711229946524e-07, "loss": 0.0, "num_tokens": 759914.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 378.25, "completions/mean_terminated_length": 378.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.05700521878763549, "frac_reward_zero_std": 0.5, "grad_norm": 0.7458099928666945, "kl": 0.00022983551025390625, "learning_rate": 5.668449197860962e-07, "loss": -0.0269, "num_tokens": 764064.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.05727284892278871, "frac_reward_zero_std": 1.0, "grad_norm": 0.003983890848308483, "kl": 8.058547973632812e-05, "learning_rate": 5.6951871657754e-07, "loss": 0.0, "num_tokens": 767183.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.05754047905794193, "frac_reward_zero_std": 1.0, "grad_norm": 0.006671077019418383, "kl": 0.0002155303955078125, "learning_rate": 5.721925133689839e-07, "loss": 0.0, "num_tokens": 770686.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 335.25, "completions/mean_terminated_length": 335.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.05780810919309514, "frac_reward_zero_std": 1.0, "grad_norm": 0.00399293246368071, "kl": 7.772445678710938e-05, "learning_rate": 5.748663101604278e-07, "loss": 0.0, "num_tokens": 774520.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.05807573932824836, "frac_reward_zero_std": 1.0, "grad_norm": 0.00919363887304796, "kl": 0.00013566017150878906, "learning_rate": 5.775401069518716e-07, "loss": 0.0, "num_tokens": 779183.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.05834336946340158, "frac_reward_zero_std": 1.0, "grad_norm": 0.018430441004121736, "kl": 0.00019311904907226562, "learning_rate": 5.802139037433155e-07, "loss": 0.0, "num_tokens": 782621.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.0586109995985548, "frac_reward_zero_std": 0.5, "grad_norm": 1.007320870541755, "kl": 8.273124694824219e-05, "learning_rate": 5.828877005347593e-07, "loss": 0.06, "num_tokens": 786035.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.05887862973370801, "frac_reward_zero_std": 1.0, "grad_norm": 0.006374713977645209, "kl": 0.00015044212341308594, "learning_rate": 5.855614973262032e-07, "loss": 0.0, "num_tokens": 788877.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.05914625986886123, "frac_reward_zero_std": 0.5, "grad_norm": 0.9166882674179853, "kl": 0.00028705596923828125, "learning_rate": 5.88235294117647e-07, "loss": -0.0214, "num_tokens": 792693.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 353.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.05941389000401445, "frac_reward_zero_std": 0.0, "grad_norm": 1.2786312014696624, "kl": 0.00014066696166992188, "learning_rate": 5.909090909090909e-07, "loss": -0.0202, "num_tokens": 796565.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 189.5, "completions/mean_terminated_length": 189.5, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.05968152013916767, "frac_reward_zero_std": 1.0, "grad_norm": 0.004195659626185833, "kl": 6.318092346191406e-05, "learning_rate": 5.935828877005348e-07, "loss": 0.0, "num_tokens": 798945.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 371.0, "completions/mean_terminated_length": 371.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.059949150274320885, "frac_reward_zero_std": 1.0, "grad_norm": 0.007553677797526544, "kl": 0.00017976760864257812, "learning_rate": 5.962566844919787e-07, "loss": 0.0, "num_tokens": 803085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 381.0, "completions/mean_terminated_length": 381.0, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.060216780409474105, "frac_reward_zero_std": 1.0, "grad_norm": 0.0047762353764222755, "kl": 0.00011491775512695312, "learning_rate": 5.989304812834223e-07, "loss": 0.0, "num_tokens": 807269.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.060484410544627325, "frac_reward_zero_std": 1.0, "grad_norm": 0.014116840701899275, "kl": 0.00025177001953125, "learning_rate": 6.016042780748662e-07, "loss": 0.0, "num_tokens": 810408.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.060752040679780545, "frac_reward_zero_std": 1.0, "grad_norm": 0.007364823157738992, "kl": 9.083747863769531e-05, "learning_rate": 6.042780748663101e-07, "loss": 0.0, "num_tokens": 814481.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 313.25, "completions/mean_terminated_length": 313.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.061019670814933764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066974491966240845, "kl": 0.00020074844360351562, "learning_rate": 6.06951871657754e-07, "loss": 0.0, "num_tokens": 817943.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.06128730095008698, "frac_reward_zero_std": 1.0, "grad_norm": 0.015856673978720353, "kl": 0.00016427040100097656, "learning_rate": 6.096256684491979e-07, "loss": 0.0, "num_tokens": 821258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.0615549310852402, "frac_reward_zero_std": 1.0, "grad_norm": 0.015989067021657803, "kl": 0.000278472900390625, "learning_rate": 6.122994652406417e-07, "loss": 0.0, "num_tokens": 824918.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.06182256122039342, "frac_reward_zero_std": 1.0, "grad_norm": 0.004711108472080869, "kl": 0.00012493133544921875, "learning_rate": 6.149732620320855e-07, "loss": 0.0, "num_tokens": 828715.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.06209019135554664, "frac_reward_zero_std": 1.0, "grad_norm": 0.008423910630775655, "kl": 0.00020503997802734375, "learning_rate": 6.176470588235294e-07, "loss": 0.0, "num_tokens": 832765.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.06235782149069985, "frac_reward_zero_std": 1.0, "grad_norm": 0.009849224544744982, "kl": 0.00019359588623046875, "learning_rate": 6.203208556149733e-07, "loss": 0.0, "num_tokens": 835899.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.06262545162585308, "frac_reward_zero_std": 0.5, "grad_norm": 0.9271266544813814, "kl": 0.00032329559326171875, "learning_rate": 6.229946524064171e-07, "loss": 0.0426, "num_tokens": 839297.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 309.75, "completions/mean_terminated_length": 309.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.06289308176100629, "frac_reward_zero_std": 1.0, "grad_norm": 0.012305896920987484, "kl": 0.0002589225769042969, "learning_rate": 6.256684491978609e-07, "loss": 0.0, "num_tokens": 842683.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 283.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.0631607118961595, "frac_reward_zero_std": 1.0, "grad_norm": 0.006168196825767285, "kl": 0.0001277923583984375, "learning_rate": 6.283422459893047e-07, "loss": 0.0, "num_tokens": 845925.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 522.375, "completions/mean_terminated_length": 522.375, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.06342834203131273, "frac_reward_zero_std": 0.5, "grad_norm": 0.5950261841273348, "kl": 0.00016546249389648438, "learning_rate": 6.310160427807486e-07, "loss": 0.0125, "num_tokens": 851532.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 549.75, "completions/mean_terminated_length": 482.0000305175781, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 0.06369597216646594, "frac_reward_zero_std": 0.5, "grad_norm": 0.5823147772981686, "kl": 0.00023746490478515625, "learning_rate": 6.336898395721924e-07, "loss": 0.1191, "num_tokens": 856978.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 308.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.06396360230161917, "frac_reward_zero_std": 1.0, "grad_norm": 0.019570442357241324, "kl": 0.00035381317138671875, "learning_rate": 6.363636363636363e-07, "loss": 0.0, "num_tokens": 860417.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.06423123243677238, "frac_reward_zero_std": 1.0, "grad_norm": 0.007594613661055187, "kl": 0.00017309188842773438, "learning_rate": 6.390374331550802e-07, "loss": 0.0, "num_tokens": 863355.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 284.875, "completions/mean_terminated_length": 284.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.0644988625719256, "frac_reward_zero_std": 1.0, "grad_norm": 0.02272708849936907, "kl": 0.0003337860107421875, "learning_rate": 6.417112299465241e-07, "loss": 0.0, "num_tokens": 866666.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 619.0, "completions/max_terminated_length": 619.0, "completions/mean_length": 410.875, "completions/mean_terminated_length": 410.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.06476649270707882, "frac_reward_zero_std": 0.5, "grad_norm": 0.5716393247917944, "kl": 0.00023937225341796875, "learning_rate": 6.44385026737968e-07, "loss": -0.0053, "num_tokens": 871061.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.06503412284223203, "frac_reward_zero_std": 1.0, "grad_norm": 0.023875731675797953, "kl": 0.00041484832763671875, "learning_rate": 6.470588235294117e-07, "loss": 0.0, "num_tokens": 874680.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.06530175297738525, "frac_reward_zero_std": 1.0, "grad_norm": 0.005785280050929685, "kl": 0.00017309188842773438, "learning_rate": 6.497326203208556e-07, "loss": 0.0, "num_tokens": 878611.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 387.375, "completions/mean_terminated_length": 387.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.06556938311253847, "frac_reward_zero_std": 1.0, "grad_norm": 0.003275515575731135, "kl": 4.76837158203125e-05, "learning_rate": 6.524064171122995e-07, "loss": 0.0, "num_tokens": 882786.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 283.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.06583701324769169, "frac_reward_zero_std": 1.0, "grad_norm": 0.07460914350387109, "kl": 0.0008335113525390625, "learning_rate": 6.550802139037433e-07, "loss": 0.0, "num_tokens": 886204.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.06610464338284491, "frac_reward_zero_std": 1.0, "grad_norm": 0.028108095449884782, "kl": 0.0003123283386230469, "learning_rate": 6.577540106951871e-07, "loss": 0.0, "num_tokens": 889268.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 294.0, "completions/mean_terminated_length": 294.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.06637227351799813, "frac_reward_zero_std": 1.0, "grad_norm": 0.008495078396081907, "kl": 0.00022411346435546875, "learning_rate": 6.604278074866309e-07, "loss": 0.0, "num_tokens": 892624.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 483.125, "completions/mean_terminated_length": 483.125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.06663990365315134, "frac_reward_zero_std": 1.0, "grad_norm": 0.007432576125937589, "kl": 0.0001666545867919922, "learning_rate": 6.631016042780748e-07, "loss": 0.0, "num_tokens": 897649.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.06690753378830457, "frac_reward_zero_std": 0.5, "grad_norm": 1.2731052272096943, "kl": 0.00036907196044921875, "learning_rate": 6.657754010695187e-07, "loss": 0.0141, "num_tokens": 901392.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 292.75, "completions/mean_terminated_length": 292.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.06717516392345778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02416508434116002, "kl": 0.00019550323486328125, "learning_rate": 6.684491978609626e-07, "loss": 0.0, "num_tokens": 904730.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.067442794058611, "frac_reward_zero_std": 1.0, "grad_norm": 0.011267819132923233, "kl": 0.00030422210693359375, "learning_rate": 6.711229946524064e-07, "loss": 0.0, "num_tokens": 908204.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.06771042419376422, "frac_reward_zero_std": 1.0, "grad_norm": 0.010289912147176075, "kl": 0.0002722740173339844, "learning_rate": 6.737967914438503e-07, "loss": 0.0, "num_tokens": 911427.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 336.75, "completions/mean_terminated_length": 336.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.06797805432891743, "frac_reward_zero_std": 0.5, "grad_norm": 0.9711612735911058, "kl": 0.0003376007080078125, "learning_rate": 6.764705882352941e-07, "loss": 0.009, "num_tokens": 915389.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 351.875, "completions/mean_terminated_length": 351.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.06824568446407066, "frac_reward_zero_std": 1.0, "grad_norm": 0.05136862432538935, "kl": 0.00041961669921875, "learning_rate": 6.79144385026738e-07, "loss": 0.0, "num_tokens": 919376.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 219.375, "completions/mean_terminated_length": 219.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06851331459922387, "frac_reward_zero_std": 0.5, "grad_norm": 1.540008751798709, "kl": 0.00030040740966796875, "learning_rate": 6.818181818181817e-07, "loss": 0.0006, "num_tokens": 922183.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.0687809447343771, "frac_reward_zero_std": 1.0, "grad_norm": 0.008886542555123288, "kl": 0.00020647048950195312, "learning_rate": 6.844919786096256e-07, "loss": 0.0, "num_tokens": 926170.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 341.5, "completions/mean_terminated_length": 341.5, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.06904857486953031, "frac_reward_zero_std": 0.5, "grad_norm": 0.9409503157079302, "kl": 0.00018262863159179688, "learning_rate": 6.871657754010695e-07, "loss": -0.0321, "num_tokens": 930258.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.06931620500468352, "frac_reward_zero_std": 0.5, "grad_norm": 0.9281746106118747, "kl": 0.00034332275390625, "learning_rate": 6.898395721925134e-07, "loss": 0.0218, "num_tokens": 933844.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.06958383513983675, "frac_reward_zero_std": 1.0, "grad_norm": 0.004896724456971208, "kl": 0.00010776519775390625, "learning_rate": 6.925133689839571e-07, "loss": 0.0, "num_tokens": 937657.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.06985146527498996, "frac_reward_zero_std": 1.0, "grad_norm": 0.004260487099084934, "kl": 8.893013000488281e-05, "learning_rate": 6.95187165775401e-07, "loss": 0.0, "num_tokens": 941017.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.07011909541014318, "frac_reward_zero_std": 1.0, "grad_norm": 0.01636481343699288, "kl": 0.00025463104248046875, "learning_rate": 6.978609625668449e-07, "loss": 0.0, "num_tokens": 944472.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.0703867255452964, "frac_reward_zero_std": 1.0, "grad_norm": 0.029267689786995694, "kl": 0.00036144256591796875, "learning_rate": 7.005347593582888e-07, "loss": 0.0, "num_tokens": 947702.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.07065435568044962, "frac_reward_zero_std": 1.0, "grad_norm": 0.008050054375139927, "kl": 0.00015807151794433594, "learning_rate": 7.032085561497327e-07, "loss": 0.0, "num_tokens": 951425.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.07092198581560284, "frac_reward_zero_std": 1.0, "grad_norm": 0.011374298294580514, "kl": 0.00023031234741210938, "learning_rate": 7.058823529411765e-07, "loss": 0.0, "num_tokens": 954773.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07118961595075605, "frac_reward_zero_std": 1.0, "grad_norm": 0.037595892391628555, "kl": 0.000354766845703125, "learning_rate": 7.085561497326202e-07, "loss": 0.0, "num_tokens": 957962.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.07145724608590927, "frac_reward_zero_std": 1.0, "grad_norm": 0.005944572921958407, "kl": 0.00011873245239257812, "learning_rate": 7.112299465240641e-07, "loss": 0.0, "num_tokens": 961619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 345.25, "completions/mean_terminated_length": 345.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.0717248762210625, "frac_reward_zero_std": 1.0, "grad_norm": 0.021046190765805516, "kl": 0.00026226043701171875, "learning_rate": 7.13903743315508e-07, "loss": 0.0, "num_tokens": 965529.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.07199250635621571, "frac_reward_zero_std": 1.0, "grad_norm": 0.0049942421005917705, "kl": 7.581710815429688e-05, "learning_rate": 7.165775401069518e-07, "loss": 0.0, "num_tokens": 968386.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 242.0, "completions/max_terminated_length": 242.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.07226013649136893, "frac_reward_zero_std": 1.0, "grad_norm": 0.006735703919601653, "kl": 5.14984130859375e-05, "learning_rate": 7.192513368983957e-07, "loss": 0.0, "num_tokens": 971281.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.07252776662652215, "frac_reward_zero_std": 1.0, "grad_norm": 0.00831403573225673, "kl": 0.000202178955078125, "learning_rate": 7.219251336898395e-07, "loss": 0.0, "num_tokens": 974491.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 331.25, "completions/mean_terminated_length": 331.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.07279539676167536, "frac_reward_zero_std": 0.5, "grad_norm": 0.8109057519675322, "kl": 0.00037288665771484375, "learning_rate": 7.245989304812834e-07, "loss": -0.0747, "num_tokens": 978185.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.07306302689682859, "frac_reward_zero_std": 1.0, "grad_norm": 0.005457088251919654, "kl": 0.00015735626220703125, "learning_rate": 7.272727272727272e-07, "loss": 0.0, "num_tokens": 981398.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.0733306570319818, "frac_reward_zero_std": 1.0, "grad_norm": 0.030336718143431557, "kl": 0.000278472900390625, "learning_rate": 7.299465240641711e-07, "loss": 0.0, "num_tokens": 984130.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.07359828716713501, "frac_reward_zero_std": 1.0, "grad_norm": 0.016036698442610346, "kl": 0.0003376007080078125, "learning_rate": 7.32620320855615e-07, "loss": 0.0, "num_tokens": 987467.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.07386591730228824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01268221938764149, "kl": 0.000202178955078125, "learning_rate": 7.352941176470589e-07, "loss": 0.0, "num_tokens": 990796.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.07413354743744145, "frac_reward_zero_std": 1.0, "grad_norm": 0.019813686478823755, "kl": 0.0002646446228027344, "learning_rate": 7.379679144385026e-07, "loss": 0.0, "num_tokens": 994177.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.07440117757259468, "frac_reward_zero_std": 1.0, "grad_norm": 0.01886375610467042, "kl": 0.0003986358642578125, "learning_rate": 7.406417112299464e-07, "loss": 0.0, "num_tokens": 997056.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 347.875, "completions/mean_terminated_length": 347.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.07466880770774789, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069869419216387315, "kl": 0.00016355514526367188, "learning_rate": 7.433155080213903e-07, "loss": 0.0, "num_tokens": 1000831.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.0749364378429011, "frac_reward_zero_std": 1.0, "grad_norm": 0.022902748411962374, "kl": 0.00035190582275390625, "learning_rate": 7.459893048128342e-07, "loss": 0.0, "num_tokens": 1003880.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.07520406797805433, "frac_reward_zero_std": 1.0, "grad_norm": 0.017641955342754364, "kl": 0.00026416778564453125, "learning_rate": 7.486631016042781e-07, "loss": 0.0, "num_tokens": 1007582.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.07547169811320754, "frac_reward_zero_std": 0.5, "grad_norm": 0.7562305302225105, "kl": 0.0001735687255859375, "learning_rate": 7.513368983957219e-07, "loss": 0.0, "num_tokens": 1011441.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.07573932824836077, "frac_reward_zero_std": 1.0, "grad_norm": 0.06418786906653354, "kl": 0.00084686279296875, "learning_rate": 7.540106951871657e-07, "loss": 0.0, "num_tokens": 1014389.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.07600695838351398, "frac_reward_zero_std": 1.0, "grad_norm": 0.004592717216860749, "kl": 9.703636169433594e-05, "learning_rate": 7.566844919786096e-07, "loss": 0.0, "num_tokens": 1017480.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.0762745885186672, "frac_reward_zero_std": 1.0, "grad_norm": 0.02538924936339737, "kl": 0.00036144256591796875, "learning_rate": 7.593582887700535e-07, "loss": 0.0, "num_tokens": 1020541.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 410.625, "completions/mean_terminated_length": 410.625, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.07654221865382042, "frac_reward_zero_std": 1.0, "grad_norm": 0.013414527527440125, "kl": 0.00032806396484375, "learning_rate": 7.620320855614973e-07, "loss": 0.0, "num_tokens": 1024902.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07680984878897364, "frac_reward_zero_std": 1.0, "grad_norm": 0.02118899959861409, "kl": 0.0004439353942871094, "learning_rate": 7.647058823529411e-07, "loss": 0.0, "num_tokens": 1027981.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.07707747892412686, "frac_reward_zero_std": 1.0, "grad_norm": 0.01656695331930112, "kl": 0.0007343292236328125, "learning_rate": 7.67379679144385e-07, "loss": 0.0, "num_tokens": 1031650.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.07734510905928008, "frac_reward_zero_std": 1.0, "grad_norm": 0.01071136972763477, "kl": 0.00011563301086425781, "learning_rate": 7.700534759358288e-07, "loss": 0.0, "num_tokens": 1034563.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 379.625, "completions/mean_terminated_length": 379.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.07761273919443329, "frac_reward_zero_std": 1.0, "grad_norm": 0.02483782016684795, "kl": 0.00021982192993164062, "learning_rate": 7.727272727272727e-07, "loss": 0.0, "num_tokens": 1038732.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.07788036932958652, "frac_reward_zero_std": 0.5, "grad_norm": 1.4720268311405567, "kl": 0.00038909912109375, "learning_rate": 7.754010695187165e-07, "loss": 0.0594, "num_tokens": 1041727.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.07814799946473973, "frac_reward_zero_std": 1.0, "grad_norm": 0.018489812188013106, "kl": 0.0004711151123046875, "learning_rate": 7.780748663101604e-07, "loss": 0.0, "num_tokens": 1044677.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 254.0, "completions/mean_terminated_length": 254.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.07841562959989294, "frac_reward_zero_std": 1.0, "grad_norm": 0.028804976949014648, "kl": 0.00021386146545410156, "learning_rate": 7.807486631016043e-07, "loss": 0.0, "num_tokens": 1047717.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.07868325973504617, "frac_reward_zero_std": 1.0, "grad_norm": 0.022851925792335255, "kl": 0.00039958953857421875, "learning_rate": 7.834224598930482e-07, "loss": 0.0, "num_tokens": 1051801.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.07895088987019938, "frac_reward_zero_std": 1.0, "grad_norm": 0.007264186635342242, "kl": 0.00013494491577148438, "learning_rate": 7.860962566844919e-07, "loss": 0.0, "num_tokens": 1055030.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.07921852000535261, "frac_reward_zero_std": 0.5, "grad_norm": 0.7003051007071395, "kl": 8.559226989746094e-05, "learning_rate": 7.887700534759358e-07, "loss": 0.0233, "num_tokens": 1058966.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.07948615014050582, "frac_reward_zero_std": 1.0, "grad_norm": 0.0067823362527270675, "kl": 0.00018596649169921875, "learning_rate": 7.914438502673797e-07, "loss": 0.0, "num_tokens": 1062434.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.07975378027565903, "frac_reward_zero_std": 1.0, "grad_norm": 0.03483375524219131, "kl": 0.0005750656127929688, "learning_rate": 7.941176470588235e-07, "loss": 0.0, "num_tokens": 1065194.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 269.375, "completions/mean_terminated_length": 269.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.08002141041081226, "frac_reward_zero_std": 1.0, "grad_norm": 0.02340756114077787, "kl": 0.0003714561462402344, "learning_rate": 7.967914438502673e-07, "loss": 0.0, "num_tokens": 1068501.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.08028904054596547, "frac_reward_zero_std": 0.5, "grad_norm": 0.7165310879505097, "kl": 0.00028896331787109375, "learning_rate": 7.994652406417111e-07, "loss": 0.0063, "num_tokens": 1071815.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.0805566706811187, "frac_reward_zero_std": 1.0, "grad_norm": 0.013687768550565355, "kl": 0.000255584716796875, "learning_rate": 8.02139037433155e-07, "loss": 0.0, "num_tokens": 1075778.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 414.75, "completions/mean_terminated_length": 414.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.08082430081627191, "frac_reward_zero_std": 0.5, "grad_norm": 0.7063435720742741, "kl": 0.00019311904907226562, "learning_rate": 8.048128342245989e-07, "loss": 0.0209, "num_tokens": 1080208.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.08109193095142513, "frac_reward_zero_std": 1.0, "grad_norm": 0.024323230108674695, "kl": 0.00037384033203125, "learning_rate": 8.074866310160428e-07, "loss": 0.0, "num_tokens": 1084211.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 281.5, "completions/mean_terminated_length": 281.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.08135956108657835, "frac_reward_zero_std": 1.0, "grad_norm": 0.019943214390457834, "kl": 0.00029277801513671875, "learning_rate": 8.101604278074866e-07, "loss": 0.0, "num_tokens": 1087463.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.08162719122173157, "frac_reward_zero_std": 1.0, "grad_norm": 0.006859042955723195, "kl": 0.00022268295288085938, "learning_rate": 8.128342245989305e-07, "loss": 0.0, "num_tokens": 1090779.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.08189482135688479, "frac_reward_zero_std": 1.0, "grad_norm": 0.004889492950706858, "kl": 0.00011706352233886719, "learning_rate": 8.155080213903743e-07, "loss": 0.0, "num_tokens": 1094615.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.082162451492038, "frac_reward_zero_std": 1.0, "grad_norm": 0.007990326084863176, "kl": 0.0002884864807128906, "learning_rate": 8.181818181818182e-07, "loss": 0.0, "num_tokens": 1097602.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 270.75, "completions/mean_terminated_length": 270.75, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.08243008162719122, "frac_reward_zero_std": 1.0, "grad_norm": 0.053775148442485377, "kl": 0.000705718994140625, "learning_rate": 8.208556149732619e-07, "loss": 0.0, "num_tokens": 1100960.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.08269771176234444, "frac_reward_zero_std": 1.0, "grad_norm": 0.0041810930247835375, "kl": 0.0001678466796875, "learning_rate": 8.235294117647058e-07, "loss": 0.0, "num_tokens": 1105137.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 224.0, "completions/mean_terminated_length": 224.0, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.08296534189749766, "frac_reward_zero_std": 1.0, "grad_norm": 0.024294970187070695, "kl": 0.00038909912109375, "learning_rate": 8.262032085561497e-07, "loss": 0.0, "num_tokens": 1107885.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.08323297203265087, "frac_reward_zero_std": 1.0, "grad_norm": 0.007943237206798069, "kl": 0.00013399124145507812, "learning_rate": 8.288770053475936e-07, "loss": 0.0, "num_tokens": 1111529.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.0835006021678041, "frac_reward_zero_std": 0.5, "grad_norm": 0.7594367883841329, "kl": 0.00025177001953125, "learning_rate": 8.315508021390373e-07, "loss": 0.02, "num_tokens": 1114841.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.08376823230295731, "frac_reward_zero_std": 1.0, "grad_norm": 0.007176100778241826, "kl": 0.00022649765014648438, "learning_rate": 8.342245989304812e-07, "loss": 0.0, "num_tokens": 1117790.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.08403586243811054, "frac_reward_zero_std": 1.0, "grad_norm": 0.005395489361206473, "kl": 8.800625801086426e-05, "learning_rate": 8.368983957219251e-07, "loss": 0.0, "num_tokens": 1120608.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 256.375, "completions/mean_terminated_length": 256.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.08430349257326375, "frac_reward_zero_std": 1.0, "grad_norm": 0.004411242639520499, "kl": 0.0001232624053955078, "learning_rate": 8.39572192513369e-07, "loss": 0.0, "num_tokens": 1123679.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.08457112270841696, "frac_reward_zero_std": 1.0, "grad_norm": 0.011930360610640399, "kl": 0.00015974044799804688, "learning_rate": 8.422459893048129e-07, "loss": 0.0, "num_tokens": 1127317.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 273.375, "completions/mean_terminated_length": 273.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.08483875284357019, "frac_reward_zero_std": 1.0, "grad_norm": 0.0058045238673953385, "kl": 0.000209808349609375, "learning_rate": 8.449197860962567e-07, "loss": 0.0, "num_tokens": 1130496.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.0851063829787234, "frac_reward_zero_std": 1.0, "grad_norm": 0.007782728647152365, "kl": 0.000164031982421875, "learning_rate": 8.475935828877004e-07, "loss": 0.0, "num_tokens": 1133919.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.08537401311387663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0084035519513013, "kl": 0.00017690658569335938, "learning_rate": 8.502673796791443e-07, "loss": 0.0, "num_tokens": 1136876.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 287.125, "completions/mean_terminated_length": 287.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.08564164324902984, "frac_reward_zero_std": 1.0, "grad_norm": 0.007713287399142902, "kl": 0.0001461505889892578, "learning_rate": 8.529411764705882e-07, "loss": 0.0, "num_tokens": 1140269.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.08590927338418305, "frac_reward_zero_std": 1.0, "grad_norm": 0.007346538237129348, "kl": 0.00014209747314453125, "learning_rate": 8.55614973262032e-07, "loss": 0.0, "num_tokens": 1142799.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 364.5, "completions/mean_terminated_length": 364.5, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.08617690351933628, "frac_reward_zero_std": 0.5, "grad_norm": 0.7750861283246548, "kl": 0.00017261505126953125, "learning_rate": 8.582887700534759e-07, "loss": 0.0168, "num_tokens": 1146939.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.0864445336544895, "frac_reward_zero_std": 1.0, "grad_norm": 0.018713301272499604, "kl": 0.0002079010009765625, "learning_rate": 8.609625668449198e-07, "loss": 0.0, "num_tokens": 1149799.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 331.25, "completions/mean_terminated_length": 331.25, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.08671216378964271, "frac_reward_zero_std": 0.5, "grad_norm": 0.9745507439965534, "kl": 0.000396728515625, "learning_rate": 8.636363636363636e-07, "loss": 0.0444, "num_tokens": 1153641.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 484.875, "completions/mean_terminated_length": 484.875, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.08697979392479593, "frac_reward_zero_std": 0.5, "grad_norm": 1.6106046388639235, "kl": 0.00028514862060546875, "learning_rate": 8.663101604278075e-07, "loss": 0.0165, "num_tokens": 1158660.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.08724742405994915, "frac_reward_zero_std": 1.0, "grad_norm": 0.058068649732648275, "kl": 0.0006122589111328125, "learning_rate": 8.689839572192513e-07, "loss": 0.0, "num_tokens": 1161886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.08751505419510237, "frac_reward_zero_std": 1.0, "grad_norm": 0.017092723481055543, "kl": 0.0003294944763183594, "learning_rate": 8.716577540106952e-07, "loss": 0.0, "num_tokens": 1165752.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.08778268433025559, "frac_reward_zero_std": 1.0, "grad_norm": 0.049518407011726835, "kl": 0.000530242919921875, "learning_rate": 8.743315508021391e-07, "loss": 0.0, "num_tokens": 1169308.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0880503144654088, "frac_reward_zero_std": 1.0, "grad_norm": 0.04482920169846318, "kl": 0.00066375732421875, "learning_rate": 8.770053475935828e-07, "loss": 0.0, "num_tokens": 1172150.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.08831794460056203, "frac_reward_zero_std": 1.0, "grad_norm": 0.01355966308014044, "kl": 0.0003299713134765625, "learning_rate": 8.796791443850266e-07, "loss": 0.0, "num_tokens": 1175998.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.08858557473571524, "frac_reward_zero_std": 1.0, "grad_norm": 0.017289826767746062, "kl": 0.000225067138671875, "learning_rate": 8.823529411764705e-07, "loss": 0.0, "num_tokens": 1179575.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.08885320487086847, "frac_reward_zero_std": 1.0, "grad_norm": 0.22454054638633017, "kl": 0.0015249252319335938, "learning_rate": 8.850267379679144e-07, "loss": 0.0001, "num_tokens": 1182764.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.08912083500602168, "frac_reward_zero_std": 1.0, "grad_norm": 0.008776978485971504, "kl": 0.00021600723266601562, "learning_rate": 8.877005347593583e-07, "loss": 0.0, "num_tokens": 1186653.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.08938846514117489, "frac_reward_zero_std": 1.0, "grad_norm": 0.005865264513826276, "kl": 0.00017547607421875, "learning_rate": 8.903743315508021e-07, "loss": 0.0, "num_tokens": 1189737.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 308.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.08965609527632812, "frac_reward_zero_std": 1.0, "grad_norm": 0.006514452902857558, "kl": 0.00016164779663085938, "learning_rate": 8.930481283422459e-07, "loss": 0.0, "num_tokens": 1193314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.08992372541148133, "frac_reward_zero_std": 1.0, "grad_norm": 0.006596801460954652, "kl": 0.00014972686767578125, "learning_rate": 8.957219251336898e-07, "loss": 0.0, "num_tokens": 1196376.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 407.375, "completions/mean_terminated_length": 407.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.09019135554663456, "frac_reward_zero_std": 1.0, "grad_norm": 0.005043444190680575, "kl": 0.00016641616821289062, "learning_rate": 8.983957219251337e-07, "loss": 0.0, "num_tokens": 1200891.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.09045898568178777, "frac_reward_zero_std": 1.0, "grad_norm": 0.013110878950321005, "kl": 0.00026702880859375, "learning_rate": 9.010695187165776e-07, "loss": 0.0, "num_tokens": 1204130.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 244.875, "completions/mean_terminated_length": 244.875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.09072661581694098, "frac_reward_zero_std": 1.0, "grad_norm": 0.008692085929411526, "kl": 0.0003018379211425781, "learning_rate": 9.037433155080213e-07, "loss": 0.0, "num_tokens": 1207097.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.09099424595209421, "frac_reward_zero_std": 1.0, "grad_norm": 0.015729279102842515, "kl": 0.00047969818115234375, "learning_rate": 9.064171122994652e-07, "loss": 0.0, "num_tokens": 1210580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 330.875, "completions/mean_terminated_length": 330.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.09126187608724742, "frac_reward_zero_std": 0.5, "grad_norm": 0.5814864309937819, "kl": 0.00028228759765625, "learning_rate": 9.09090909090909e-07, "loss": -0.1285, "num_tokens": 1214359.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 478.625, "completions/mean_terminated_length": 478.625, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.09152950622240064, "frac_reward_zero_std": 1.0, "grad_norm": 0.004726561822688958, "kl": 0.00015163421630859375, "learning_rate": 9.117647058823529e-07, "loss": 0.0, "num_tokens": 1219276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.09179713635755386, "frac_reward_zero_std": 1.0, "grad_norm": 0.008800514351528224, "kl": 0.00018978118896484375, "learning_rate": 9.144385026737967e-07, "loss": 0.0, "num_tokens": 1222719.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.09206476649270708, "frac_reward_zero_std": 1.0, "grad_norm": 0.023570277176361933, "kl": 0.00018453598022460938, "learning_rate": 9.171122994652406e-07, "loss": 0.0, "num_tokens": 1225435.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.0923323966278603, "frac_reward_zero_std": 1.0, "grad_norm": 0.016943727372149605, "kl": 0.00032901763916015625, "learning_rate": 9.197860962566845e-07, "loss": 0.0, "num_tokens": 1229190.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.09260002676301352, "frac_reward_zero_std": 1.0, "grad_norm": 0.01105956625705818, "kl": 0.0002899169921875, "learning_rate": 9.224598930481284e-07, "loss": 0.0, "num_tokens": 1232623.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.09286765689816673, "frac_reward_zero_std": 0.5, "grad_norm": 1.026565431153433, "kl": 0.000244140625, "learning_rate": 9.251336898395721e-07, "loss": 0.0155, "num_tokens": 1236515.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 204.0, "completions/mean_terminated_length": 204.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.09313528703331996, "frac_reward_zero_std": 1.0, "grad_norm": 0.014466417998747023, "kl": 0.0002532005310058594, "learning_rate": 9.27807486631016e-07, "loss": 0.0, "num_tokens": 1239115.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 372.125, "completions/mean_terminated_length": 372.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.09340291716847317, "frac_reward_zero_std": 1.0, "grad_norm": 0.011887131821726401, "kl": 0.00029087066650390625, "learning_rate": 9.304812834224599e-07, "loss": 0.0, "num_tokens": 1243336.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.0936705473036264, "frac_reward_zero_std": 0.5, "grad_norm": 0.8674095879724265, "kl": 0.0003604888916015625, "learning_rate": 9.331550802139037e-07, "loss": 0.0446, "num_tokens": 1246636.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 371.75, "completions/mean_terminated_length": 371.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.09393817743877961, "frac_reward_zero_std": 1.0, "grad_norm": 0.014975191502576092, "kl": 0.00036334991455078125, "learning_rate": 9.358288770053476e-07, "loss": 0.0, "num_tokens": 1250934.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.09420580757393282, "frac_reward_zero_std": 1.0, "grad_norm": 0.051687526722627684, "kl": 0.0010232925415039062, "learning_rate": 9.385026737967913e-07, "loss": 0.0, "num_tokens": 1254374.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 390.25, "completions/mean_terminated_length": 390.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.09447343770908605, "frac_reward_zero_std": 1.0, "grad_norm": 0.007167666423965781, "kl": 0.00028133392333984375, "learning_rate": 9.411764705882352e-07, "loss": 0.0, "num_tokens": 1258944.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 348.125, "completions/mean_terminated_length": 348.125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.09474106784423926, "frac_reward_zero_std": 1.0, "grad_norm": 0.057267330781924454, "kl": 0.000980377197265625, "learning_rate": 9.438502673796791e-07, "loss": 0.0, "num_tokens": 1262777.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 232.375, "completions/mean_terminated_length": 232.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.09500869797939247, "frac_reward_zero_std": 1.0, "grad_norm": 0.003928694778420512, "kl": 0.00011181831359863281, "learning_rate": 9.46524064171123e-07, "loss": 0.0, "num_tokens": 1265632.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.0952763281145457, "frac_reward_zero_std": 1.0, "grad_norm": 0.05381610213878918, "kl": 0.001575469970703125, "learning_rate": 9.491978609625668e-07, "loss": 0.0001, "num_tokens": 1268953.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.09554395824969891, "frac_reward_zero_std": 1.0, "grad_norm": 0.018031320247219304, "kl": 0.0006542205810546875, "learning_rate": 9.518716577540107e-07, "loss": 0.0, "num_tokens": 1271975.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.09581158838485214, "frac_reward_zero_std": 1.0, "grad_norm": 0.035045880206485296, "kl": 0.0008640289306640625, "learning_rate": 9.545454545454546e-07, "loss": 0.0, "num_tokens": 1275244.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.09607921852000535, "frac_reward_zero_std": 1.0, "grad_norm": 0.008842796472147576, "kl": 0.00040912628173828125, "learning_rate": 9.572192513368984e-07, "loss": 0.0, "num_tokens": 1278709.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.09634684865515857, "frac_reward_zero_std": 1.0, "grad_norm": 0.09051113247345358, "kl": 0.00261688232421875, "learning_rate": 9.598930481283421e-07, "loss": 0.0001, "num_tokens": 1282515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.09661447879031179, "frac_reward_zero_std": 1.0, "grad_norm": 0.03154112125471542, "kl": 0.0008754730224609375, "learning_rate": 9.62566844919786e-07, "loss": 0.0, "num_tokens": 1285576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 226.75, "completions/mean_terminated_length": 226.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.096882108925465, "frac_reward_zero_std": 1.0, "grad_norm": 0.1111691877873292, "kl": 0.002880096435546875, "learning_rate": 9.652406417112299e-07, "loss": 0.0001, "num_tokens": 1288378.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.09714973906061823, "frac_reward_zero_std": 1.0, "grad_norm": 0.016740408124959825, "kl": 0.0005884170532226562, "learning_rate": 9.679144385026738e-07, "loss": 0.0, "num_tokens": 1291226.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.09741736919577144, "frac_reward_zero_std": 1.0, "grad_norm": 0.05094030651031297, "kl": 0.001148223876953125, "learning_rate": 9.705882352941176e-07, "loss": 0.0, "num_tokens": 1295387.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.09768499933092466, "frac_reward_zero_std": 1.0, "grad_norm": 0.012957979323986221, "kl": 0.0003314018249511719, "learning_rate": 9.732620320855615e-07, "loss": 0.0, "num_tokens": 1299349.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 358.25, "completions/mean_terminated_length": 358.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.09795262946607788, "frac_reward_zero_std": 1.0, "grad_norm": 0.058595399272166615, "kl": 0.001056671142578125, "learning_rate": 9.759358288770054e-07, "loss": 0.0, "num_tokens": 1303371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 348.25, "completions/mean_terminated_length": 348.25, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.0982202596012311, "frac_reward_zero_std": 1.0, "grad_norm": 0.04274249043877871, "kl": 0.0006237030029296875, "learning_rate": 9.78609625668449e-07, "loss": 0.0, "num_tokens": 1307557.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.09848788973638432, "frac_reward_zero_std": 1.0, "grad_norm": 0.013151658664578991, "kl": 0.000415802001953125, "learning_rate": 9.81283422459893e-07, "loss": 0.0, "num_tokens": 1310196.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.09875551987153754, "frac_reward_zero_std": 1.0, "grad_norm": 0.028345915543584583, "kl": 0.0005855560302734375, "learning_rate": 9.839572192513369e-07, "loss": 0.0, "num_tokens": 1313658.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.09902315000669075, "frac_reward_zero_std": 1.0, "grad_norm": 0.044031534682649094, "kl": 0.00080108642578125, "learning_rate": 9.866310160427807e-07, "loss": 0.0, "num_tokens": 1316908.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.09929078014184398, "frac_reward_zero_std": 1.0, "grad_norm": 0.0437021459874804, "kl": 0.0007543563842773438, "learning_rate": 9.893048128342244e-07, "loss": 0.0, "num_tokens": 1320361.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 351.75, "completions/mean_terminated_length": 351.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.09955841027699719, "frac_reward_zero_std": 1.0, "grad_norm": 0.021585978878788107, "kl": 0.00046539306640625, "learning_rate": 9.919786096256683e-07, "loss": 0.0, "num_tokens": 1324363.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.0998260404121504, "frac_reward_zero_std": 1.0, "grad_norm": 0.013636220746786961, "kl": 0.00020551681518554688, "learning_rate": 9.946524064171122e-07, "loss": 0.0, "num_tokens": 1326857.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.10009367054730363, "frac_reward_zero_std": 0.5, "grad_norm": 1.0276009157963706, "kl": 0.0004730224609375, "learning_rate": 9.97326203208556e-07, "loss": -0.0092, "num_tokens": 1330338.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.10036130068245684, "frac_reward_zero_std": 1.0, "grad_norm": 0.006768830026545141, "kl": 0.000308990478515625, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1333883.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.10062893081761007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054794207236401405, "kl": 0.00016689300537109375, "learning_rate": 9.999998036510888e-07, "loss": 0.0, "num_tokens": 1337579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.10089656095276328, "frac_reward_zero_std": 1.0, "grad_norm": 0.006213582940837, "kl": 0.00015974044799804688, "learning_rate": 9.999992146045266e-07, "loss": 0.0, "num_tokens": 1341530.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 249.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.1011641910879165, "frac_reward_zero_std": 1.0, "grad_norm": 0.01032049914663759, "kl": 0.00022268295288085938, "learning_rate": 9.999982328608274e-07, "loss": 0.0, "num_tokens": 1344626.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 414.25, "completions/mean_terminated_length": 414.25, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.10143182122306972, "frac_reward_zero_std": 0.5, "grad_norm": 0.7534997274370485, "kl": 0.00022554397583007812, "learning_rate": 9.99996858420848e-07, "loss": 0.0475, "num_tokens": 1348996.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 377.875, "completions/mean_terminated_length": 377.875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.10169945135822293, "frac_reward_zero_std": 1.0, "grad_norm": 0.005595881431151507, "kl": 0.00019216537475585938, "learning_rate": 9.99995091285788e-07, "loss": 0.0, "num_tokens": 1353487.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 316.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.10196708149337616, "frac_reward_zero_std": 1.0, "grad_norm": 0.016462551148523092, "kl": 0.00026988983154296875, "learning_rate": 9.99992931457189e-07, "loss": 0.0, "num_tokens": 1357262.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.10223471162852937, "frac_reward_zero_std": 0.5, "grad_norm": 0.9546841254084789, "kl": 0.00015020370483398438, "learning_rate": 9.999903789369362e-07, "loss": 0.0038, "num_tokens": 1361183.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.10250234176368259, "frac_reward_zero_std": 1.0, "grad_norm": 0.008097143279864905, "kl": 0.00023794174194335938, "learning_rate": 9.999874337272572e-07, "loss": 0.0, "num_tokens": 1364935.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.10276997189883581, "frac_reward_zero_std": 1.0, "grad_norm": 0.015361101607139717, "kl": 0.00021600723266601562, "learning_rate": 9.999840958307218e-07, "loss": 0.0, "num_tokens": 1369375.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.10303760203398903, "frac_reward_zero_std": 0.5, "grad_norm": 0.6564848164136472, "kl": 0.00025081634521484375, "learning_rate": 9.99980365250243e-07, "loss": -0.0122, "num_tokens": 1373648.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.10330523216914224, "frac_reward_zero_std": 1.0, "grad_norm": 0.015331679375777082, "kl": 0.0002841949462890625, "learning_rate": 9.999762419890764e-07, "loss": 0.0, "num_tokens": 1376698.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 391.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.10357286230429547, "frac_reward_zero_std": 1.0, "grad_norm": 0.020367414819771862, "kl": 0.0003833770751953125, "learning_rate": 9.9997172605082e-07, "loss": 0.0, "num_tokens": 1380958.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.10384049243944868, "frac_reward_zero_std": 1.0, "grad_norm": 0.010876126434525669, "kl": 0.0003256797790527344, "learning_rate": 9.99966817439415e-07, "loss": 0.0, "num_tokens": 1385260.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.1041081225746019, "frac_reward_zero_std": 1.0, "grad_norm": 0.040762032890712355, "kl": 0.0006656646728515625, "learning_rate": 9.99961516159145e-07, "loss": 0.0, "num_tokens": 1388559.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.10437575270975512, "frac_reward_zero_std": 1.0, "grad_norm": 0.02886801028694782, "kl": 0.00054931640625, "learning_rate": 9.999558222146356e-07, "loss": 0.0, "num_tokens": 1391880.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.10464338284490833, "frac_reward_zero_std": 1.0, "grad_norm": 0.025422903463396398, "kl": 0.0003509521484375, "learning_rate": 9.999497356108565e-07, "loss": 0.0, "num_tokens": 1394957.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.10491101298006156, "frac_reward_zero_std": 1.0, "grad_norm": 0.01990884618027876, "kl": 0.00041961669921875, "learning_rate": 9.999432563531187e-07, "loss": 0.0, "num_tokens": 1398208.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.10517864311521477, "frac_reward_zero_std": 1.0, "grad_norm": 0.009932342363929612, "kl": 0.00022029876708984375, "learning_rate": 9.999363844470767e-07, "loss": 0.0, "num_tokens": 1401845.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.105446273250368, "frac_reward_zero_std": 1.0, "grad_norm": 0.024825630415438418, "kl": 0.00030517578125, "learning_rate": 9.999291198987272e-07, "loss": 0.0, "num_tokens": 1405147.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10571390338552121, "frac_reward_zero_std": 1.0, "grad_norm": 0.006522896101774946, "kl": 0.0001494884490966797, "learning_rate": 9.999214627144095e-07, "loss": 0.0, "num_tokens": 1409073.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.10598153352067442, "frac_reward_zero_std": 0.5, "grad_norm": 0.991040190020681, "kl": 0.00030422210693359375, "learning_rate": 9.99913412900806e-07, "loss": 0.0236, "num_tokens": 1413103.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.10624916365582765, "frac_reward_zero_std": 1.0, "grad_norm": 0.010991785977195313, "kl": 0.00018787384033203125, "learning_rate": 9.999049704649414e-07, "loss": 0.0, "num_tokens": 1416197.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.10651679379098086, "frac_reward_zero_std": 1.0, "grad_norm": 0.0055730300947604205, "kl": 0.00016689300537109375, "learning_rate": 9.998961354141834e-07, "loss": 0.0, "num_tokens": 1419607.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.10678442392613409, "frac_reward_zero_std": 1.0, "grad_norm": 0.007935711613325548, "kl": 0.00010919570922851562, "learning_rate": 9.998869077562414e-07, "loss": 0.0, "num_tokens": 1422361.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.1070520540612873, "frac_reward_zero_std": 1.0, "grad_norm": 0.00893080424821541, "kl": 0.00018072128295898438, "learning_rate": 9.998772874991684e-07, "loss": 0.0, "num_tokens": 1425676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 209.25, "completions/mean_terminated_length": 209.25, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.10731968419644052, "frac_reward_zero_std": 1.0, "grad_norm": 0.004621665803496376, "kl": 5.936622619628906e-05, "learning_rate": 9.998672746513597e-07, "loss": 0.0, "num_tokens": 1428278.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.10758731433159374, "frac_reward_zero_std": 1.0, "grad_norm": 0.00476905660987452, "kl": 0.00016355514526367188, "learning_rate": 9.99856869221553e-07, "loss": 0.0, "num_tokens": 1431663.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.10785494446674695, "frac_reward_zero_std": 1.0, "grad_norm": 0.005330564272353815, "kl": 0.00015306472778320312, "learning_rate": 9.998460712188285e-07, "loss": 0.0, "num_tokens": 1434907.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.10812257460190017, "frac_reward_zero_std": 1.0, "grad_norm": 0.00516212812688921, "kl": 0.00012803077697753906, "learning_rate": 9.998348806526097e-07, "loss": 0.0, "num_tokens": 1438630.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1083902047370534, "frac_reward_zero_std": 1.0, "grad_norm": 0.02693183845955497, "kl": 0.0002460479736328125, "learning_rate": 9.998232975326616e-07, "loss": 0.0, "num_tokens": 1441516.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.10865783487220661, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061643275157566456, "kl": 0.00022411346435546875, "learning_rate": 9.99811321869093e-07, "loss": 0.0, "num_tokens": 1444940.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.10892546500735983, "frac_reward_zero_std": 0.5, "grad_norm": 0.8936753871921082, "kl": 0.0003871917724609375, "learning_rate": 9.99798953672354e-07, "loss": -0.0383, "num_tokens": 1448779.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 410.25, "completions/mean_terminated_length": 410.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.10919309514251305, "frac_reward_zero_std": 0.5, "grad_norm": 0.63857462410155, "kl": 0.00020694732666015625, "learning_rate": 9.997861929532382e-07, "loss": -0.0434, "num_tokens": 1453093.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.10946072527766626, "frac_reward_zero_std": 1.0, "grad_norm": 0.01076667799682243, "kl": 0.00025463104248046875, "learning_rate": 9.997730397228815e-07, "loss": 0.0, "num_tokens": 1457074.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.10972835541281949, "frac_reward_zero_std": 1.0, "grad_norm": 0.018973758698209886, "kl": 0.00028133392333984375, "learning_rate": 9.997594939927616e-07, "loss": 0.0, "num_tokens": 1459659.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.1099959855479727, "frac_reward_zero_std": 1.0, "grad_norm": 0.005503275533104705, "kl": 0.00015687942504882812, "learning_rate": 9.997455557747e-07, "loss": 0.0, "num_tokens": 1462503.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 330.75, "completions/mean_terminated_length": 330.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.11026361568312593, "frac_reward_zero_std": 1.0, "grad_norm": 0.03436063856309171, "kl": 0.0003552436828613281, "learning_rate": 9.9973122508086e-07, "loss": 0.0, "num_tokens": 1466425.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 203.875, "completions/mean_terminated_length": 203.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.11053124581827914, "frac_reward_zero_std": 0.5, "grad_norm": 1.534397494588389, "kl": 0.00034427642822265625, "learning_rate": 9.99716501923747e-07, "loss": 0.0338, "num_tokens": 1468968.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.11079887595343235, "frac_reward_zero_std": 1.0, "grad_norm": 0.01086133011380426, "kl": 0.0003452301025390625, "learning_rate": 9.997013863162098e-07, "loss": 0.0, "num_tokens": 1472596.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.11106650608858558, "frac_reward_zero_std": 1.0, "grad_norm": 0.04342753447747822, "kl": 0.0005207061767578125, "learning_rate": 9.996858782714389e-07, "loss": 0.0, "num_tokens": 1476260.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.11133413622373879, "frac_reward_zero_std": 1.0, "grad_norm": 0.013067432537827765, "kl": 0.0003509521484375, "learning_rate": 9.996699778029677e-07, "loss": 0.0, "num_tokens": 1480094.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.111601766358892, "frac_reward_zero_std": 1.0, "grad_norm": 0.017728151504955676, "kl": 0.00035762786865234375, "learning_rate": 9.99653684924672e-07, "loss": 0.0, "num_tokens": 1483462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.11186939649404523, "frac_reward_zero_std": 1.0, "grad_norm": 0.010011029123438158, "kl": 0.00023317337036132812, "learning_rate": 9.9963699965077e-07, "loss": 0.0, "num_tokens": 1486602.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.11213702662919844, "frac_reward_zero_std": 1.0, "grad_norm": 0.015222297022505198, "kl": 0.00032806396484375, "learning_rate": 9.99619921995822e-07, "loss": 0.0, "num_tokens": 1489551.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 312.75, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.11240465676435167, "frac_reward_zero_std": 0.0, "grad_norm": 1.176087414633444, "kl": 0.0002503395080566406, "learning_rate": 9.996024519747312e-07, "loss": -0.0608, "num_tokens": 1493017.0, "reward": 0.375, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.11267228689950488, "frac_reward_zero_std": 1.0, "grad_norm": 0.021823952824227093, "kl": 0.0004825592041015625, "learning_rate": 9.99584589602743e-07, "loss": 0.0, "num_tokens": 1496085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.1129399170346581, "frac_reward_zero_std": 1.0, "grad_norm": 0.009909153086887905, "kl": 0.000400543212890625, "learning_rate": 9.995663348954455e-07, "loss": 0.0, "num_tokens": 1499514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.11320754716981132, "frac_reward_zero_std": 1.0, "grad_norm": 0.015154436533245406, "kl": 0.00046539306640625, "learning_rate": 9.995476878687684e-07, "loss": 0.0, "num_tokens": 1503080.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.11347517730496454, "frac_reward_zero_std": 1.0, "grad_norm": 0.026901391706705347, "kl": 0.000732421875, "learning_rate": 9.995286485389847e-07, "loss": 0.0, "num_tokens": 1506218.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 366.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.11374280744011776, "frac_reward_zero_std": 1.0, "grad_norm": 0.01429195161391434, "kl": 0.0004978179931640625, "learning_rate": 9.995092169227089e-07, "loss": 0.0, "num_tokens": 1510153.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 422.75, "completions/mean_terminated_length": 422.75, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.11401043757527098, "frac_reward_zero_std": 1.0, "grad_norm": 0.006462990911216886, "kl": 0.00028228759765625, "learning_rate": 9.994893930368985e-07, "loss": 0.0, "num_tokens": 1514731.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.11427806771042419, "frac_reward_zero_std": 1.0, "grad_norm": 0.07169588247846094, "kl": 0.0019207000732421875, "learning_rate": 9.99469176898853e-07, "loss": 0.0001, "num_tokens": 1518650.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.11454569784557742, "frac_reward_zero_std": 0.5, "grad_norm": 0.7947835511954863, "kl": 0.00038814544677734375, "learning_rate": 9.994485685262143e-07, "loss": 0.014, "num_tokens": 1522250.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.11481332798073063, "frac_reward_zero_std": 1.0, "grad_norm": 0.016997442132569585, "kl": 0.00042438507080078125, "learning_rate": 9.994275679369663e-07, "loss": 0.0, "num_tokens": 1525620.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.11508095811588386, "frac_reward_zero_std": 1.0, "grad_norm": 0.005231521630169851, "kl": 0.00019741058349609375, "learning_rate": 9.994061751494354e-07, "loss": 0.0, "num_tokens": 1528837.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.11534858825103707, "frac_reward_zero_std": 1.0, "grad_norm": 0.008764956582801658, "kl": 0.00023746490478515625, "learning_rate": 9.993843901822908e-07, "loss": 0.0, "num_tokens": 1532560.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 234.0, "completions/mean_terminated_length": 234.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.11561621838619028, "frac_reward_zero_std": 1.0, "grad_norm": 0.009138807406725862, "kl": 0.0002765655517578125, "learning_rate": 9.993622130545427e-07, "loss": 0.0, "num_tokens": 1535472.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.11588384852134351, "frac_reward_zero_std": 1.0, "grad_norm": 0.005778160566669272, "kl": 0.00022077560424804688, "learning_rate": 9.99339643785545e-07, "loss": 0.0, "num_tokens": 1539080.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.11615147865649672, "frac_reward_zero_std": 0.5, "grad_norm": 0.8153263557778346, "kl": 0.00042629241943359375, "learning_rate": 9.993166823949923e-07, "loss": 0.0091, "num_tokens": 1542608.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 325.75, "completions/mean_terminated_length": 325.75, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.11641910879164993, "frac_reward_zero_std": 1.0, "grad_norm": 0.01983196230194631, "kl": 0.0007038116455078125, "learning_rate": 9.992933289029224e-07, "loss": 0.0, "num_tokens": 1546370.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 481.125, "completions/mean_terminated_length": 481.125, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.11668673892680316, "frac_reward_zero_std": 1.0, "grad_norm": 0.06511380810827792, "kl": 0.0017766952514648438, "learning_rate": 9.992695833297154e-07, "loss": 0.0001, "num_tokens": 1551543.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.11695436906195637, "frac_reward_zero_std": 1.0, "grad_norm": 0.05818219063194694, "kl": 0.001392364501953125, "learning_rate": 9.992454456960924e-07, "loss": 0.0001, "num_tokens": 1555271.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.1172219991971096, "frac_reward_zero_std": 1.0, "grad_norm": 0.02826700983016937, "kl": 0.000812530517578125, "learning_rate": 9.99220916023118e-07, "loss": 0.0, "num_tokens": 1559360.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.11748962933226281, "frac_reward_zero_std": 1.0, "grad_norm": 0.005484006254637226, "kl": 0.0001780986785888672, "learning_rate": 9.99195994332198e-07, "loss": 0.0, "num_tokens": 1562597.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.11775725946741603, "frac_reward_zero_std": 1.0, "grad_norm": 0.008665176874565991, "kl": 0.000316619873046875, "learning_rate": 9.991706806450809e-07, "loss": 0.0, "num_tokens": 1566136.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.11802488960256925, "frac_reward_zero_std": 1.0, "grad_norm": 0.00683889031947812, "kl": 0.00019693374633789062, "learning_rate": 9.991449749838567e-07, "loss": 0.0, "num_tokens": 1569220.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 297.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.11829251973772247, "frac_reward_zero_std": 0.5, "grad_norm": 0.9138953362859219, "kl": 0.000823974609375, "learning_rate": 9.991188773709577e-07, "loss": 0.0446, "num_tokens": 1572701.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 233.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.11856014987287569, "frac_reward_zero_std": 1.0, "grad_norm": 0.004199504849368019, "kl": 8.916854858398438e-05, "learning_rate": 9.990923878291586e-07, "loss": 0.0, "num_tokens": 1575462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.1188277800080289, "frac_reward_zero_std": 1.0, "grad_norm": 0.013100508983848725, "kl": 0.0003986358642578125, "learning_rate": 9.990655063815755e-07, "loss": 0.0, "num_tokens": 1578891.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 355.5, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.11909541014318212, "frac_reward_zero_std": 1.0, "grad_norm": 0.04887058896734658, "kl": 0.0016374588012695312, "learning_rate": 9.990382330516673e-07, "loss": 0.0001, "num_tokens": 1582743.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.11936304027833534, "frac_reward_zero_std": 1.0, "grad_norm": 0.028855786427012513, "kl": 0.000896453857421875, "learning_rate": 9.990105678632337e-07, "loss": 0.0, "num_tokens": 1586388.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.11963067041348856, "frac_reward_zero_std": 1.0, "grad_norm": 0.008561256221548665, "kl": 0.00026988983154296875, "learning_rate": 9.989825108404176e-07, "loss": 0.0, "num_tokens": 1589714.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.11989830054864177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0287145388579542, "kl": 0.0004673004150390625, "learning_rate": 9.98954062007703e-07, "loss": 0.0, "num_tokens": 1593210.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.120165930683795, "frac_reward_zero_std": 0.5, "grad_norm": 0.8200083816732087, "kl": 0.00036334991455078125, "learning_rate": 9.989252213899164e-07, "loss": -0.0077, "num_tokens": 1597316.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.12043356081894821, "frac_reward_zero_std": 0.5, "grad_norm": 1.1156433600437217, "kl": 0.0006256103515625, "learning_rate": 9.988959890122255e-07, "loss": -0.0078, "num_tokens": 1601178.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.12070119095410144, "frac_reward_zero_std": 1.0, "grad_norm": 0.01329305494477296, "kl": 0.00046634674072265625, "learning_rate": 9.988663649001407e-07, "loss": 0.0, "num_tokens": 1604432.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 332.5, "completions/mean_terminated_length": 332.5, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.12096882108925465, "frac_reward_zero_std": 1.0, "grad_norm": 0.07884889972176597, "kl": 0.00142669677734375, "learning_rate": 9.988363490795134e-07, "loss": 0.0001, "num_tokens": 1608268.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 306.125, "completions/mean_terminated_length": 306.125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.12123645122440786, "frac_reward_zero_std": 1.0, "grad_norm": 0.011276850048707687, "kl": 0.0005750656127929688, "learning_rate": 9.988059415765376e-07, "loss": 0.0, "num_tokens": 1612161.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.12150408135956109, "frac_reward_zero_std": 1.0, "grad_norm": 0.06400495676208345, "kl": 0.001373291015625, "learning_rate": 9.987751424177487e-07, "loss": 0.0001, "num_tokens": 1615856.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.1217717114947143, "frac_reward_zero_std": 1.0, "grad_norm": 0.008560399814646442, "kl": 0.00034809112548828125, "learning_rate": 9.987439516300241e-07, "loss": 0.0, "num_tokens": 1619310.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.12203934162986753, "frac_reward_zero_std": 1.0, "grad_norm": 0.08883693157436372, "kl": 0.00286865234375, "learning_rate": 9.987123692405823e-07, "loss": 0.0001, "num_tokens": 1622562.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.12230697176502074, "frac_reward_zero_std": 1.0, "grad_norm": 0.017617107934114386, "kl": 0.0007152557373046875, "learning_rate": 9.986803952769844e-07, "loss": 0.0, "num_tokens": 1626175.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.12257460190017395, "frac_reward_zero_std": 1.0, "grad_norm": 0.010924725858592723, "kl": 0.000431060791015625, "learning_rate": 9.98648029767133e-07, "loss": 0.0, "num_tokens": 1629867.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.12284223203532718, "frac_reward_zero_std": 1.0, "grad_norm": 0.04257564348731927, "kl": 0.0007762908935546875, "learning_rate": 9.986152727392721e-07, "loss": 0.0, "num_tokens": 1633101.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.1231098621704804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0933617271674593, "kl": 0.00140380859375, "learning_rate": 9.985821242219874e-07, "loss": 0.0001, "num_tokens": 1636043.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.12337749230563362, "frac_reward_zero_std": 0.5, "grad_norm": 0.9176914447198431, "kl": 0.0003509521484375, "learning_rate": 9.985485842442063e-07, "loss": 0.0036, "num_tokens": 1639476.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.12364512244078683, "frac_reward_zero_std": 1.0, "grad_norm": 0.029891967533381088, "kl": 0.0006866455078125, "learning_rate": 9.98514652835198e-07, "loss": 0.0, "num_tokens": 1642633.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.12391275257594005, "frac_reward_zero_std": 1.0, "grad_norm": 0.26066195346479765, "kl": 0.002864837646484375, "learning_rate": 9.984803300245731e-07, "loss": 0.0001, "num_tokens": 1645928.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 203.25, "completions/mean_terminated_length": 203.25, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.12418038271109327, "frac_reward_zero_std": 1.0, "grad_norm": 0.011848621545451746, "kl": 0.0003261566162109375, "learning_rate": 9.98445615842284e-07, "loss": 0.0, "num_tokens": 1648598.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.12444801284624649, "frac_reward_zero_std": 0.5, "grad_norm": 0.6240561334092712, "kl": 0.0003204345703125, "learning_rate": 9.984105103186244e-07, "loss": -0.0258, "num_tokens": 1652397.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1247156429813997, "frac_reward_zero_std": 1.0, "grad_norm": 0.008480895691678777, "kl": 0.00024890899658203125, "learning_rate": 9.983750134842292e-07, "loss": 0.0, "num_tokens": 1655745.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.12498327311655293, "frac_reward_zero_std": 1.0, "grad_norm": 0.014230156674706052, "kl": 0.0003643035888671875, "learning_rate": 9.983391253700755e-07, "loss": 0.0, "num_tokens": 1658512.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 339.5, "completions/mean_terminated_length": 339.5, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.12525090325170615, "frac_reward_zero_std": 1.0, "grad_norm": 0.005073941158735102, "kl": 0.00015497207641601562, "learning_rate": 9.98302846007481e-07, "loss": 0.0, "num_tokens": 1662304.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.12551853338685937, "frac_reward_zero_std": 1.0, "grad_norm": 0.010316261383639388, "kl": 0.00037670135498046875, "learning_rate": 9.98266175428106e-07, "loss": 0.0, "num_tokens": 1665806.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.12578616352201258, "frac_reward_zero_std": 1.0, "grad_norm": 0.0065926463559186545, "kl": 0.000217437744140625, "learning_rate": 9.98229113663951e-07, "loss": 0.0, "num_tokens": 1668889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1260537936571658, "frac_reward_zero_std": 0.5, "grad_norm": 0.7315891083116233, "kl": 0.0003528594970703125, "learning_rate": 9.981916607473586e-07, "loss": 0.0, "num_tokens": 1672472.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 324.0, "completions/mean_terminated_length": 324.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.126321423792319, "frac_reward_zero_std": 1.0, "grad_norm": 0.013089401836158475, "kl": 0.00038051605224609375, "learning_rate": 9.981538167110126e-07, "loss": 0.0, "num_tokens": 1676568.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 359.875, "completions/mean_terminated_length": 359.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.12658905392747224, "frac_reward_zero_std": 1.0, "grad_norm": 0.009274828261087852, "kl": 0.0002799034118652344, "learning_rate": 9.98115581587938e-07, "loss": 0.0, "num_tokens": 1680759.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.12685668406262546, "frac_reward_zero_std": 1.0, "grad_norm": 0.05941937807742805, "kl": 0.00093841552734375, "learning_rate": 9.980769554115009e-07, "loss": 0.0, "num_tokens": 1684037.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.12712431419777867, "frac_reward_zero_std": 1.0, "grad_norm": 0.0066500707011458325, "kl": 0.0002288818359375, "learning_rate": 9.980379382154092e-07, "loss": 0.0, "num_tokens": 1687789.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 361.5, "completions/mean_terminated_length": 361.5, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.12739194433293188, "frac_reward_zero_std": 1.0, "grad_norm": 0.0336071589790411, "kl": 0.0005025863647460938, "learning_rate": 9.979985300337115e-07, "loss": 0.0, "num_tokens": 1691721.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.1276595744680851, "frac_reward_zero_std": 1.0, "grad_norm": 0.008285094613847055, "kl": 0.00040912628173828125, "learning_rate": 9.979587309007978e-07, "loss": 0.0, "num_tokens": 1695311.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.12792720460323834, "frac_reward_zero_std": 1.0, "grad_norm": 0.036461099717968626, "kl": 0.0003829002380371094, "learning_rate": 9.979185408513995e-07, "loss": 0.0, "num_tokens": 1699262.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.12819483473839155, "frac_reward_zero_std": 1.0, "grad_norm": 0.02274708799196191, "kl": 0.000476837158203125, "learning_rate": 9.978779599205889e-07, "loss": 0.0, "num_tokens": 1703145.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.12846246487354476, "frac_reward_zero_std": 1.0, "grad_norm": 0.03700616437993258, "kl": 0.0006251335144042969, "learning_rate": 9.978369881437789e-07, "loss": 0.0, "num_tokens": 1707073.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 415.375, "completions/mean_terminated_length": 415.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.12873009500869798, "frac_reward_zero_std": 0.0, "grad_norm": 1.6341430792017966, "kl": 0.00028705596923828125, "learning_rate": 9.97795625556725e-07, "loss": -0.0511, "num_tokens": 1711524.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 367.0, "completions/mean_terminated_length": 367.0, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.1289977251438512, "frac_reward_zero_std": 1.0, "grad_norm": 0.006366719565804852, "kl": 0.00022935867309570312, "learning_rate": 9.977538721955218e-07, "loss": 0.0, "num_tokens": 1715456.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.12926535527900443, "frac_reward_zero_std": 1.0, "grad_norm": 0.007329145314938225, "kl": 0.00023603439331054688, "learning_rate": 9.977117280966064e-07, "loss": 0.0, "num_tokens": 1719126.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.12953298541415764, "frac_reward_zero_std": 0.5, "grad_norm": 0.7686575119795547, "kl": 0.00024318695068359375, "learning_rate": 9.97669193296756e-07, "loss": 0.06, "num_tokens": 1722666.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 420.5, "completions/mean_terminated_length": 334.2857360839844, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.12980061554931085, "frac_reward_zero_std": 0.5, "grad_norm": 0.4949467433232356, "kl": 0.00012201070785522461, "learning_rate": 9.976262678330894e-07, "loss": 0.1602, "num_tokens": 1726994.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.13006824568446407, "frac_reward_zero_std": 0.5, "grad_norm": 1.1612145503371445, "kl": 0.000690460205078125, "learning_rate": 9.97582951743066e-07, "loss": 0.0734, "num_tokens": 1730849.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.13033587581961728, "frac_reward_zero_std": 0.5, "grad_norm": 1.253409920643393, "kl": 0.0006833076477050781, "learning_rate": 9.975392450644862e-07, "loss": -0.0521, "num_tokens": 1734601.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.1306035059547705, "frac_reward_zero_std": 1.0, "grad_norm": 0.05246502758860608, "kl": 0.002532958984375, "learning_rate": 9.974951478354907e-07, "loss": 0.0001, "num_tokens": 1737790.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 418.0, "completions/mean_terminated_length": 418.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.13087113608992373, "frac_reward_zero_std": 1.0, "grad_norm": 0.014433788808548042, "kl": 0.0010051727294921875, "learning_rate": 9.974506600945616e-07, "loss": 0.0, "num_tokens": 1742498.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 338.375, "completions/mean_terminated_length": 338.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.13113876622507695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0736226918625407, "kl": 0.0032863616943359375, "learning_rate": 9.97405781880522e-07, "loss": 0.0001, "num_tokens": 1746213.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 422.25, "completions/mean_terminated_length": 422.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.13140639636023016, "frac_reward_zero_std": 1.0, "grad_norm": 0.07645240838813396, "kl": 0.003173828125, "learning_rate": 9.97360513232535e-07, "loss": 0.0001, "num_tokens": 1750627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 283.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.13167402649538337, "frac_reward_zero_std": 1.0, "grad_norm": 0.011976688679944931, "kl": 0.00072479248046875, "learning_rate": 9.973148541901052e-07, "loss": 0.0, "num_tokens": 1753916.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.13194165663053659, "frac_reward_zero_std": 1.0, "grad_norm": 0.061193875774643164, "kl": 0.007904052734375, "learning_rate": 9.972688047930773e-07, "loss": 0.0003, "num_tokens": 1756782.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.13220928676568983, "frac_reward_zero_std": 1.0, "grad_norm": 0.05566843015850411, "kl": 0.004608154296875, "learning_rate": 9.972223650816367e-07, "loss": 0.0002, "num_tokens": 1760648.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.13247691690084304, "frac_reward_zero_std": 1.0, "grad_norm": 0.021578389966219292, "kl": 0.00246429443359375, "learning_rate": 9.971755350963098e-07, "loss": 0.0001, "num_tokens": 1763630.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.13274454703599625, "frac_reward_zero_std": 1.0, "grad_norm": 0.01659531068166165, "kl": 0.00418853759765625, "learning_rate": 9.971283148779632e-07, "loss": 0.0002, "num_tokens": 1766317.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.13301217717114946, "frac_reward_zero_std": 1.0, "grad_norm": 1.1716130225438117, "kl": 0.02570343017578125, "learning_rate": 9.970807044678043e-07, "loss": 0.001, "num_tokens": 1769826.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.13327980730630268, "frac_reward_zero_std": 0.5, "grad_norm": 0.5725880023490283, "kl": 0.00209808349609375, "learning_rate": 9.97032703907381e-07, "loss": 0.0047, "num_tokens": 1773427.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.13354743744145592, "frac_reward_zero_std": 1.0, "grad_norm": 0.03835903691211433, "kl": 0.00409698486328125, "learning_rate": 9.96984313238581e-07, "loss": 0.0002, "num_tokens": 1777039.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.13381506757660913, "frac_reward_zero_std": 1.0, "grad_norm": 0.1967871363002955, "kl": 0.0124359130859375, "learning_rate": 9.969355325036336e-07, "loss": 0.0005, "num_tokens": 1780087.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.13408269771176234, "frac_reward_zero_std": 1.0, "grad_norm": 0.07203759213892778, "kl": 0.006103515625, "learning_rate": 9.968863617451077e-07, "loss": 0.0002, "num_tokens": 1783524.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 320.125, "completions/mean_terminated_length": 320.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.13435032784691556, "frac_reward_zero_std": 1.0, "grad_norm": 0.013881218239364819, "kl": 0.00067901611328125, "learning_rate": 9.968368010059126e-07, "loss": 0.0, "num_tokens": 1787041.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 327.875, "completions/mean_terminated_length": 327.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.13461795798206877, "frac_reward_zero_std": 1.0, "grad_norm": 0.3954690344858878, "kl": 0.020263671875, "learning_rate": 9.967868503292981e-07, "loss": 0.0008, "num_tokens": 1790780.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.134885588117222, "frac_reward_zero_std": 1.0, "grad_norm": 0.21089092347846683, "kl": 0.013519287109375, "learning_rate": 9.967365097588546e-07, "loss": 0.0005, "num_tokens": 1793941.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.13515321825237522, "frac_reward_zero_std": 1.0, "grad_norm": 0.009612968553165922, "kl": 0.00049591064453125, "learning_rate": 9.96685779338512e-07, "loss": 0.0, "num_tokens": 1797182.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.13542084838752844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0170638220630097, "kl": 0.0010833740234375, "learning_rate": 9.966346591125408e-07, "loss": 0.0, "num_tokens": 1800204.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.13568847852268165, "frac_reward_zero_std": 0.5, "grad_norm": 0.797605964219667, "kl": 0.0028228759765625, "learning_rate": 9.96583149125552e-07, "loss": 0.007, "num_tokens": 1803988.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.13595610865783486, "frac_reward_zero_std": 1.0, "grad_norm": 0.04914113576863653, "kl": 0.00418853759765625, "learning_rate": 9.96531249422496e-07, "loss": 0.0002, "num_tokens": 1806937.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1362237387929881, "frac_reward_zero_std": 1.0, "grad_norm": 0.010808539415594705, "kl": 0.00043964385986328125, "learning_rate": 9.96478960048664e-07, "loss": 0.0, "num_tokens": 1809910.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.13649136892814132, "frac_reward_zero_std": 1.0, "grad_norm": 0.04810494715635189, "kl": 0.0022125244140625, "learning_rate": 9.964262810496866e-07, "loss": 0.0001, "num_tokens": 1812878.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.13675899906329453, "frac_reward_zero_std": 1.0, "grad_norm": 0.052711870979104056, "kl": 0.001438140869140625, "learning_rate": 9.963732124715351e-07, "loss": 0.0001, "num_tokens": 1816032.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.13702662919844774, "frac_reward_zero_std": 1.0, "grad_norm": 0.015288434216030912, "kl": 0.000629425048828125, "learning_rate": 9.963197543605205e-07, "loss": 0.0, "num_tokens": 1819788.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.13729425933360095, "frac_reward_zero_std": 1.0, "grad_norm": 0.01805010692717675, "kl": 0.000881195068359375, "learning_rate": 9.962659067632932e-07, "loss": 0.0, "num_tokens": 1822806.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 220.625, "completions/mean_terminated_length": 220.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.1375618894687542, "frac_reward_zero_std": 1.0, "grad_norm": 0.011561076085573344, "kl": 0.000652313232421875, "learning_rate": 9.96211669726844e-07, "loss": 0.0, "num_tokens": 1825459.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 229.25, "completions/mean_terminated_length": 229.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.1378295196039074, "frac_reward_zero_std": 1.0, "grad_norm": 0.009409341296505528, "kl": 0.000446319580078125, "learning_rate": 9.96157043298504e-07, "loss": 0.0, "num_tokens": 1828253.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 326.125, "completions/mean_terminated_length": 326.125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.13809714973906062, "frac_reward_zero_std": 1.0, "grad_norm": 0.3504101551785696, "kl": 0.00444793701171875, "learning_rate": 9.96102027525943e-07, "loss": 0.0002, "num_tokens": 1831886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.13836477987421383, "frac_reward_zero_std": 1.0, "grad_norm": 0.025402032051111786, "kl": 0.0010128021240234375, "learning_rate": 9.960466224571718e-07, "loss": 0.0, "num_tokens": 1835665.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 376.375, "completions/mean_terminated_length": 376.375, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.13863241000936705, "frac_reward_zero_std": 0.5, "grad_norm": 0.6122943886066132, "kl": 0.000518798828125, "learning_rate": 9.959908281405398e-07, "loss": 0.0691, "num_tokens": 1839692.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.13890004014452026, "frac_reward_zero_std": 1.0, "grad_norm": 0.01502983173973432, "kl": 0.00027370452880859375, "learning_rate": 9.959346446247365e-07, "loss": 0.0, "num_tokens": 1842955.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1391676702796735, "frac_reward_zero_std": 1.0, "grad_norm": 0.017933414407763814, "kl": 0.0005235671997070312, "learning_rate": 9.958780719587916e-07, "loss": 0.0, "num_tokens": 1845738.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.1394353004148267, "frac_reward_zero_std": 0.5, "grad_norm": 1.641176832515681, "kl": 0.0007781982421875, "learning_rate": 9.958211101920732e-07, "loss": -0.0004, "num_tokens": 1849242.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.13970293054997993, "frac_reward_zero_std": 1.0, "grad_norm": 0.029985698781650784, "kl": 0.0008192062377929688, "learning_rate": 9.957637593742905e-07, "loss": 0.0, "num_tokens": 1852968.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.13997056068513314, "frac_reward_zero_std": 1.0, "grad_norm": 0.00968038203473445, "kl": 0.000347137451171875, "learning_rate": 9.957060195554907e-07, "loss": 0.0, "num_tokens": 1855734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.14023819082028635, "frac_reward_zero_std": 0.5, "grad_norm": 0.7209618334843481, "kl": 0.0004901885986328125, "learning_rate": 9.956478907860613e-07, "loss": -0.0196, "num_tokens": 1859164.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1405058209554396, "frac_reward_zero_std": 1.0, "grad_norm": 0.00885351189490293, "kl": 0.000370025634765625, "learning_rate": 9.955893731167293e-07, "loss": 0.0, "num_tokens": 1862282.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.1407734510905928, "frac_reward_zero_std": 0.5, "grad_norm": 1.042987533564096, "kl": 0.000457763671875, "learning_rate": 9.95530466598561e-07, "loss": -0.0065, "num_tokens": 1865439.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.14104108122574602, "frac_reward_zero_std": 1.0, "grad_norm": 0.01059290429935015, "kl": 0.00044536590576171875, "learning_rate": 9.95471171282961e-07, "loss": 0.0, "num_tokens": 1868804.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 381.875, "completions/mean_terminated_length": 381.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.14130871136089923, "frac_reward_zero_std": 1.0, "grad_norm": 0.017123289203560304, "kl": 0.0005807876586914062, "learning_rate": 9.954114872216748e-07, "loss": 0.0, "num_tokens": 1872923.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 332.25, "completions/mean_terminated_length": 332.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.14157634149605244, "frac_reward_zero_std": 1.0, "grad_norm": 0.019624913769792827, "kl": 0.000774383544921875, "learning_rate": 9.95351414466786e-07, "loss": 0.0, "num_tokens": 1876673.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.14184397163120568, "frac_reward_zero_std": 1.0, "grad_norm": 0.031827042866104244, "kl": 0.00074005126953125, "learning_rate": 9.952909530707185e-07, "loss": 0.0, "num_tokens": 1880254.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.1421116017663589, "frac_reward_zero_std": 1.0, "grad_norm": 0.033896342102465477, "kl": 0.00168609619140625, "learning_rate": 9.952301030862337e-07, "loss": 0.0001, "num_tokens": 1884186.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.1423792319015121, "frac_reward_zero_std": 1.0, "grad_norm": 0.029858065060618406, "kl": 0.001392364501953125, "learning_rate": 9.951688645664337e-07, "loss": 0.0001, "num_tokens": 1888130.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 226.875, "completions/mean_terminated_length": 226.875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.14264686203666532, "frac_reward_zero_std": 1.0, "grad_norm": 0.03908844971754174, "kl": 0.002445697784423828, "learning_rate": 9.951072375647587e-07, "loss": 0.0001, "num_tokens": 1890857.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.14291449217181854, "frac_reward_zero_std": 1.0, "grad_norm": 0.053886813173595824, "kl": 0.00180816650390625, "learning_rate": 9.950452221349886e-07, "loss": 0.0001, "num_tokens": 1893782.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 338.75, "completions/mean_terminated_length": 338.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.14318212230697178, "frac_reward_zero_std": 1.0, "grad_norm": 0.014824850456647264, "kl": 0.000518798828125, "learning_rate": 9.949828183312415e-07, "loss": 0.0, "num_tokens": 1897660.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.143449752442125, "frac_reward_zero_std": 1.0, "grad_norm": 0.031328473857934434, "kl": 0.0016326904296875, "learning_rate": 9.94920026207975e-07, "loss": 0.0001, "num_tokens": 1900677.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.1437173825772782, "frac_reward_zero_std": 0.5, "grad_norm": 0.6226341406291016, "kl": 0.000446319580078125, "learning_rate": 9.948568458199853e-07, "loss": -0.0248, "num_tokens": 1904093.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.14398501271243141, "frac_reward_zero_std": 1.0, "grad_norm": 0.052751519491162004, "kl": 0.0015087127685546875, "learning_rate": 9.947932772224076e-07, "loss": 0.0001, "num_tokens": 1907008.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.14425264284758463, "frac_reward_zero_std": 1.0, "grad_norm": 0.04177952775744063, "kl": 0.001728057861328125, "learning_rate": 9.94729320470716e-07, "loss": 0.0001, "num_tokens": 1909893.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 263.625, "completions/mean_terminated_length": 263.625, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.14452027298273787, "frac_reward_zero_std": 1.0, "grad_norm": 0.01635527017715889, "kl": 0.0007610321044921875, "learning_rate": 9.946649756207228e-07, "loss": 0.0, "num_tokens": 1912974.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 400.0, "completions/mean_terminated_length": 400.0, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.14478790311789108, "frac_reward_zero_std": 1.0, "grad_norm": 0.011156159595031627, "kl": 0.0007076263427734375, "learning_rate": 9.946002427285792e-07, "loss": 0.0, "num_tokens": 1917198.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.1450555332530443, "frac_reward_zero_std": 1.0, "grad_norm": 0.050467417247429355, "kl": 0.001331329345703125, "learning_rate": 9.945351218507753e-07, "loss": 0.0001, "num_tokens": 1920185.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 242.625, "completions/mean_terminated_length": 242.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1453231633881975, "frac_reward_zero_std": 1.0, "grad_norm": 0.00824413487155551, "kl": 0.0002658367156982422, "learning_rate": 9.944696130441397e-07, "loss": 0.0, "num_tokens": 1923174.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.14559079352335072, "frac_reward_zero_std": 0.5, "grad_norm": 1.0259571161635082, "kl": 0.0006427764892578125, "learning_rate": 9.944037163658396e-07, "loss": 0.0624, "num_tokens": 1927051.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.14585842365850396, "frac_reward_zero_std": 0.5, "grad_norm": 0.6895397627451952, "kl": 0.00090789794921875, "learning_rate": 9.9433743187338e-07, "loss": 0.0, "num_tokens": 1931015.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 217.125, "completions/mean_terminated_length": 217.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.14612605379365717, "frac_reward_zero_std": 1.0, "grad_norm": 0.06037592573291269, "kl": 0.002155303955078125, "learning_rate": 9.942707596246051e-07, "loss": 0.0001, "num_tokens": 1933664.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.1463936839288104, "frac_reward_zero_std": 0.5, "grad_norm": 1.5387149228638608, "kl": 0.0005731582641601562, "learning_rate": 9.942036996776972e-07, "loss": 0.0008, "num_tokens": 1937368.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1466613140639636, "frac_reward_zero_std": 1.0, "grad_norm": 0.018433322639683707, "kl": 0.0007333755493164062, "learning_rate": 9.94136252091177e-07, "loss": 0.0, "num_tokens": 1940288.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 257.125, "completions/mean_terminated_length": 257.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1469289441991168, "frac_reward_zero_std": 1.0, "grad_norm": 0.010414266826967713, "kl": 0.00042724609375, "learning_rate": 9.940684169239034e-07, "loss": 0.0, "num_tokens": 1943549.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 223.625, "completions/mean_terminated_length": 223.625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.14719657433427003, "frac_reward_zero_std": 1.0, "grad_norm": 0.03230451832945062, "kl": 0.0010204315185546875, "learning_rate": 9.940001942350737e-07, "loss": 0.0, "num_tokens": 1946254.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.14746420446942327, "frac_reward_zero_std": 1.0, "grad_norm": 0.046690847868426186, "kl": 0.0014362335205078125, "learning_rate": 9.939315840842232e-07, "loss": 0.0001, "num_tokens": 1950409.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.14773183460457648, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068481638107596166, "kl": 0.00029754638671875, "learning_rate": 9.93862586531225e-07, "loss": 0.0, "num_tokens": 1953375.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.1479994647397297, "frac_reward_zero_std": 0.5, "grad_norm": 0.8075068609766187, "kl": 0.0004553794860839844, "learning_rate": 9.93793201636291e-07, "loss": -0.0531, "num_tokens": 1957102.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.1482670948748829, "frac_reward_zero_std": 1.0, "grad_norm": 0.03213587249883756, "kl": 0.0006084442138671875, "learning_rate": 9.937234294599707e-07, "loss": 0.0, "num_tokens": 1960242.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.14853472501003612, "frac_reward_zero_std": 1.0, "grad_norm": 0.053790404092944034, "kl": 0.0007963180541992188, "learning_rate": 9.936532700631516e-07, "loss": 0.0, "num_tokens": 1963842.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 278.25, "completions/mean_terminated_length": 278.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.14880235514518936, "frac_reward_zero_std": 1.0, "grad_norm": 0.031000537758605166, "kl": 0.0011653900146484375, "learning_rate": 9.935827235070591e-07, "loss": 0.0, "num_tokens": 1967036.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.14906998528034257, "frac_reward_zero_std": 0.5, "grad_norm": 0.6634436741041001, "kl": 0.00037860870361328125, "learning_rate": 9.935117898532565e-07, "loss": 0.012, "num_tokens": 1970348.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 430.625, "completions/mean_terminated_length": 430.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.14933761541549578, "frac_reward_zero_std": 0.5, "grad_norm": 0.8400805924233327, "kl": 0.00039005279541015625, "learning_rate": 9.93440469163645e-07, "loss": -0.0271, "num_tokens": 1975297.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.149605245550649, "frac_reward_zero_std": 1.0, "grad_norm": 0.015825677520335344, "kl": 0.0010242462158203125, "learning_rate": 9.93368761500463e-07, "loss": 0.0, "num_tokens": 1978255.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.1498728756858022, "frac_reward_zero_std": 1.0, "grad_norm": 0.010210551633150811, "kl": 0.00037860870361328125, "learning_rate": 9.932966669262876e-07, "loss": 0.0, "num_tokens": 1981279.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.15014050582095545, "frac_reward_zero_std": 0.5, "grad_norm": 0.7994707474917686, "kl": 0.0006866455078125, "learning_rate": 9.932241855040327e-07, "loss": -0.0047, "num_tokens": 1984911.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.15040813595610866, "frac_reward_zero_std": 1.0, "grad_norm": 0.09821596923480325, "kl": 0.0047607421875, "learning_rate": 9.9315131729695e-07, "loss": 0.0002, "num_tokens": 1988189.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.15067576609126188, "frac_reward_zero_std": 1.0, "grad_norm": 0.009404518344817748, "kl": 0.000446319580078125, "learning_rate": 9.93078062368629e-07, "loss": 0.0, "num_tokens": 1991242.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.1509433962264151, "frac_reward_zero_std": 1.0, "grad_norm": 0.01490422857206491, "kl": 0.0007190704345703125, "learning_rate": 9.930044207829964e-07, "loss": 0.0, "num_tokens": 1994544.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.1512110263615683, "frac_reward_zero_std": 1.0, "grad_norm": 0.016891037552495755, "kl": 0.0014781951904296875, "learning_rate": 9.929303926043167e-07, "loss": 0.0001, "num_tokens": 1997465.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 533.75, "completions/mean_terminated_length": 533.75, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.15147865649672154, "frac_reward_zero_std": 0.5, "grad_norm": 1.033656854611728, "kl": 0.001277923583984375, "learning_rate": 9.928559778971908e-07, "loss": 0.0409, "num_tokens": 2002995.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 279.375, "completions/mean_terminated_length": 279.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.15174628663187475, "frac_reward_zero_std": 1.0, "grad_norm": 0.010583221611918514, "kl": 0.00048732757568359375, "learning_rate": 9.927811767265581e-07, "loss": 0.0, "num_tokens": 2006234.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.15201391676702797, "frac_reward_zero_std": 1.0, "grad_norm": 0.013303277632361074, "kl": 0.00102996826171875, "learning_rate": 9.927059891576945e-07, "loss": 0.0, "num_tokens": 2009838.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 237.0, "completions/max_terminated_length": 237.0, "completions/mean_length": 193.75, "completions/mean_terminated_length": 193.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.15228154690218118, "frac_reward_zero_std": 1.0, "grad_norm": 0.037420775407790825, "kl": 0.00223541259765625, "learning_rate": 9.926304152562132e-07, "loss": 0.0001, "num_tokens": 2012456.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.1525491770373344, "frac_reward_zero_std": 0.0, "grad_norm": 1.708613829400758, "kl": 0.004230499267578125, "learning_rate": 9.92554455088065e-07, "loss": 0.0866, "num_tokens": 2016721.0, "reward": 0.5, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 195.75, "completions/mean_terminated_length": 195.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.15281680717248763, "frac_reward_zero_std": 1.0, "grad_norm": 0.05946246373268087, "kl": 0.007171630859375, "learning_rate": 9.924781087195374e-07, "loss": 0.0003, "num_tokens": 2019319.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.15308443730764085, "frac_reward_zero_std": 1.0, "grad_norm": 0.033317657443341725, "kl": 0.0019130706787109375, "learning_rate": 9.924013762172548e-07, "loss": 0.0001, "num_tokens": 2022419.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.15335206744279406, "frac_reward_zero_std": 1.0, "grad_norm": 0.049297057059867845, "kl": 0.0042724609375, "learning_rate": 9.923242576481786e-07, "loss": 0.0002, "num_tokens": 2025487.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.15361969757794727, "frac_reward_zero_std": 1.0, "grad_norm": 0.15033534942445878, "kl": 0.0069580078125, "learning_rate": 9.922467530796076e-07, "loss": 0.0003, "num_tokens": 2028615.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.15388732771310049, "frac_reward_zero_std": 1.0, "grad_norm": 0.02160771875381798, "kl": 0.001255035400390625, "learning_rate": 9.92168862579177e-07, "loss": 0.0001, "num_tokens": 2032029.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 848.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 411.375, "completions/mean_terminated_length": 411.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.15415495784825373, "frac_reward_zero_std": 0.5, "grad_norm": 0.595673537882265, "kl": 0.005218505859375, "learning_rate": 9.920905862148583e-07, "loss": -0.0199, "num_tokens": 2036684.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.15442258798340694, "frac_reward_zero_std": 1.0, "grad_norm": 0.2636625690324213, "kl": 0.015899658203125, "learning_rate": 9.92011924054961e-07, "loss": 0.0006, "num_tokens": 2040597.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.15469021811856015, "frac_reward_zero_std": 1.0, "grad_norm": 0.03896202859915143, "kl": 0.00299835205078125, "learning_rate": 9.919328761681306e-07, "loss": 0.0001, "num_tokens": 2043574.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 259.0, "completions/max_terminated_length": 259.0, "completions/mean_length": 205.0, "completions/mean_terminated_length": 205.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.15495784825371337, "frac_reward_zero_std": 1.0, "grad_norm": 0.01991782599891687, "kl": 0.0021343231201171875, "learning_rate": 9.918534426233486e-07, "loss": 0.0001, "num_tokens": 2046274.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 384.125, "completions/mean_terminated_length": 384.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.15522547838886658, "frac_reward_zero_std": 1.0, "grad_norm": 0.010284048006104557, "kl": 0.0004520416259765625, "learning_rate": 9.917736234899337e-07, "loss": 0.0, "num_tokens": 2050479.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 313.875, "completions/mean_terminated_length": 313.875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.1554931085240198, "frac_reward_zero_std": 1.0, "grad_norm": 0.03177917158360494, "kl": 0.0028362274169921875, "learning_rate": 9.916934188375415e-07, "loss": 0.0001, "num_tokens": 2053990.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.15576073865917303, "frac_reward_zero_std": 1.0, "grad_norm": 0.034692339326321014, "kl": 0.0059967041015625, "learning_rate": 9.916128287361632e-07, "loss": 0.0002, "num_tokens": 2056632.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.15602836879432624, "frac_reward_zero_std": 1.0, "grad_norm": 0.051755248537555074, "kl": 0.00417327880859375, "learning_rate": 9.915318532561269e-07, "loss": 0.0002, "num_tokens": 2060258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 423.875, "completions/mean_terminated_length": 423.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.15629599892947946, "frac_reward_zero_std": 0.5, "grad_norm": 0.6592126337510127, "kl": 0.001087188720703125, "learning_rate": 9.914504924680964e-07, "loss": 0.0305, "num_tokens": 2064701.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 291.25, "completions/mean_terminated_length": 291.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.15656362906463267, "frac_reward_zero_std": 0.5, "grad_norm": 0.719345256405448, "kl": 0.00368499755859375, "learning_rate": 9.913687464430726e-07, "loss": -0.0062, "num_tokens": 2068283.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.15683125919978588, "frac_reward_zero_std": 1.0, "grad_norm": 0.011658492922905292, "kl": 0.001171112060546875, "learning_rate": 9.912866152523919e-07, "loss": 0.0, "num_tokens": 2072076.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.15709888933493912, "frac_reward_zero_std": 0.5, "grad_norm": 0.8665467918973613, "kl": 0.007293701171875, "learning_rate": 9.912040989677269e-07, "loss": 0.002, "num_tokens": 2076126.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.15736651947009234, "frac_reward_zero_std": 1.0, "grad_norm": 0.0315808158793898, "kl": 0.0033540725708007812, "learning_rate": 9.911211976610868e-07, "loss": 0.0001, "num_tokens": 2078962.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 224.0, "completions/max_terminated_length": 224.0, "completions/mean_length": 188.125, "completions/mean_terminated_length": 188.125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.15763414960524555, "frac_reward_zero_std": 1.0, "grad_norm": 0.019742737029032016, "kl": 0.00185394287109375, "learning_rate": 9.910379114048162e-07, "loss": 0.0001, "num_tokens": 2081451.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.15790177974039876, "frac_reward_zero_std": 1.0, "grad_norm": 0.018652025590720035, "kl": 0.0012187957763671875, "learning_rate": 9.90954240271596e-07, "loss": 0.0, "num_tokens": 2084987.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.15816940987555198, "frac_reward_zero_std": 1.0, "grad_norm": 0.006932247373952548, "kl": 0.0002684593200683594, "learning_rate": 9.908701843344427e-07, "loss": 0.0, "num_tokens": 2088658.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.15843704001070522, "frac_reward_zero_std": 0.5, "grad_norm": 1.0686723619801186, "kl": 0.009979248046875, "learning_rate": 9.907857436667085e-07, "loss": -0.044, "num_tokens": 2091601.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.15870467014585843, "frac_reward_zero_std": 0.5, "grad_norm": 1.03330670467835, "kl": 0.03668212890625, "learning_rate": 9.907009183420817e-07, "loss": 0.0301, "num_tokens": 2095475.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.15897230028101164, "frac_reward_zero_std": 1.0, "grad_norm": 0.013849417022406796, "kl": 0.0010986328125, "learning_rate": 9.906157084345865e-07, "loss": 0.0, "num_tokens": 2099106.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.15923993041616485, "frac_reward_zero_std": 1.0, "grad_norm": 0.015009008721142964, "kl": 0.00091552734375, "learning_rate": 9.905301140185816e-07, "loss": 0.0, "num_tokens": 2101930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 207.875, "completions/mean_terminated_length": 207.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.15950756055131807, "frac_reward_zero_std": 1.0, "grad_norm": 0.25454555673468504, "kl": 0.016876220703125, "learning_rate": 9.904441351687624e-07, "loss": 0.0007, "num_tokens": 2104557.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.1597751906864713, "frac_reward_zero_std": 1.0, "grad_norm": 0.009951562001451019, "kl": 0.0005321502685546875, "learning_rate": 9.903577719601596e-07, "loss": 0.0, "num_tokens": 2108629.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 367.5, "completions/mean_terminated_length": 367.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.16004282082162452, "frac_reward_zero_std": 0.5, "grad_norm": 0.6344086019291509, "kl": 0.00247955322265625, "learning_rate": 9.902710244681386e-07, "loss": 0.0645, "num_tokens": 2112697.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.16031045095677773, "frac_reward_zero_std": 1.0, "grad_norm": 0.049109641587331256, "kl": 0.00283050537109375, "learning_rate": 9.901838927684008e-07, "loss": 0.0001, "num_tokens": 2116074.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.16057808109193095, "frac_reward_zero_std": 1.0, "grad_norm": 0.02854432971953057, "kl": 0.0032196044921875, "learning_rate": 9.900963769369826e-07, "loss": 0.0001, "num_tokens": 2119996.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.16084571122708416, "frac_reward_zero_std": 1.0, "grad_norm": 0.09734709589556297, "kl": 0.0062713623046875, "learning_rate": 9.900084770502561e-07, "loss": 0.0003, "num_tokens": 2122994.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1611133413622374, "frac_reward_zero_std": 1.0, "grad_norm": 0.05388181710089403, "kl": 0.00347900390625, "learning_rate": 9.89920193184928e-07, "loss": 0.0001, "num_tokens": 2126099.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.1613809714973906, "frac_reward_zero_std": 1.0, "grad_norm": 0.04369204921595306, "kl": 0.00384521484375, "learning_rate": 9.8983152541804e-07, "loss": 0.0002, "num_tokens": 2129276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.16164860163254383, "frac_reward_zero_std": 1.0, "grad_norm": 0.04575397151550739, "kl": 0.002777099609375, "learning_rate": 9.897424738269693e-07, "loss": 0.0001, "num_tokens": 2132976.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.16191623176769704, "frac_reward_zero_std": 1.0, "grad_norm": 0.035336105295191225, "kl": 0.00119781494140625, "learning_rate": 9.896530384894276e-07, "loss": 0.0, "num_tokens": 2136317.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 381.5, "completions/mean_terminated_length": 381.5, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.16218386190285025, "frac_reward_zero_std": 0.5, "grad_norm": 1.1661951697393447, "kl": 0.000629425048828125, "learning_rate": 9.895632194834624e-07, "loss": -0.0511, "num_tokens": 2140333.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 404.875, "completions/mean_terminated_length": 404.875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.1624514920380035, "frac_reward_zero_std": 1.0, "grad_norm": 0.012859847282869813, "kl": 0.0006389617919921875, "learning_rate": 9.894730168874544e-07, "loss": 0.0, "num_tokens": 2144776.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.1627191221731567, "frac_reward_zero_std": 1.0, "grad_norm": 0.04672034248496116, "kl": 0.003086090087890625, "learning_rate": 9.893824307801205e-07, "loss": 0.0001, "num_tokens": 2147428.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.16298675230830992, "frac_reward_zero_std": 0.5, "grad_norm": 1.1087475629615768, "kl": 0.00118255615234375, "learning_rate": 9.892914612405117e-07, "loss": -0.0026, "num_tokens": 2151195.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 224.875, "completions/mean_terminated_length": 224.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.16325438244346313, "frac_reward_zero_std": 0.5, "grad_norm": 1.290387691098408, "kl": 0.00319671630859375, "learning_rate": 9.892001083480132e-07, "loss": -0.0188, "num_tokens": 2153930.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 355.75, "completions/mean_terminated_length": 355.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.16352201257861634, "frac_reward_zero_std": 1.0, "grad_norm": 0.12811234903893345, "kl": 0.00316619873046875, "learning_rate": 9.89108372182346e-07, "loss": 0.0001, "num_tokens": 2157948.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.16378964271376958, "frac_reward_zero_std": 1.0, "grad_norm": 0.04264958684737397, "kl": 0.001434326171875, "learning_rate": 9.89016252823564e-07, "loss": 0.0001, "num_tokens": 2161258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 438.0, "completions/mean_terminated_length": 438.0, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.1640572728489228, "frac_reward_zero_std": 0.5, "grad_norm": 0.7854842097521374, "kl": 0.001216888427734375, "learning_rate": 9.889237503520567e-07, "loss": 0.0385, "num_tokens": 2166014.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.164324902984076, "frac_reward_zero_std": 1.0, "grad_norm": 0.03654346372538915, "kl": 0.002166748046875, "learning_rate": 9.888308648485473e-07, "loss": 0.0001, "num_tokens": 2169645.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.16459253311922922, "frac_reward_zero_std": 1.0, "grad_norm": 0.2452766806237618, "kl": 0.01068115234375, "learning_rate": 9.887375963940935e-07, "loss": 0.0004, "num_tokens": 2173120.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 482.125, "completions/mean_terminated_length": 482.125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.16486016325438244, "frac_reward_zero_std": 1.0, "grad_norm": 0.06080709760155649, "kl": 0.0020751953125, "learning_rate": 9.88643945070087e-07, "loss": 0.0001, "num_tokens": 2178173.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.16512779338953565, "frac_reward_zero_std": 1.0, "grad_norm": 0.04203341429857676, "kl": 0.001987457275390625, "learning_rate": 9.885499109582536e-07, "loss": 0.0001, "num_tokens": 2181150.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 335.0, "completions/mean_terminated_length": 335.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.1653954235246889, "frac_reward_zero_std": 1.0, "grad_norm": 0.007739037854549642, "kl": 0.0003681182861328125, "learning_rate": 9.884554941406537e-07, "loss": 0.0, "num_tokens": 2184814.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.1656630536598421, "frac_reward_zero_std": 1.0, "grad_norm": 0.013225668090483261, "kl": 0.0009002685546875, "learning_rate": 9.88360694699681e-07, "loss": 0.0, "num_tokens": 2188030.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.16593068379499532, "frac_reward_zero_std": 1.0, "grad_norm": 0.01997731346954878, "kl": 0.0009517669677734375, "learning_rate": 9.882655127180635e-07, "loss": 0.0, "num_tokens": 2192389.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 387.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.16619831393014853, "frac_reward_zero_std": 0.5, "grad_norm": 0.557374815759338, "kl": 0.00038909912109375, "learning_rate": 9.881699482788625e-07, "loss": 0.0, "num_tokens": 2197033.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.16646594406530174, "frac_reward_zero_std": 1.0, "grad_norm": 0.0573315130765567, "kl": 0.00323486328125, "learning_rate": 9.88074001465474e-07, "loss": 0.0001, "num_tokens": 2200141.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 161.875, "completions/mean_terminated_length": 161.875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.16673357420045498, "frac_reward_zero_std": 1.0, "grad_norm": 0.01827509720931134, "kl": 0.000881195068359375, "learning_rate": 9.879776723616267e-07, "loss": 0.0, "num_tokens": 2202324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 207.375, "completions/mean_terminated_length": 207.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1670012043356082, "frac_reward_zero_std": 1.0, "grad_norm": 0.12103946273373388, "kl": 0.0052642822265625, "learning_rate": 9.878809610513833e-07, "loss": 0.0002, "num_tokens": 2204843.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 205.25, "completions/mean_terminated_length": 205.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.1672688344707614, "frac_reward_zero_std": 1.0, "grad_norm": 0.01888846230730746, "kl": 0.00090789794921875, "learning_rate": 9.877838676191406e-07, "loss": 0.0, "num_tokens": 2207549.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.16753646460591462, "frac_reward_zero_std": 1.0, "grad_norm": 0.024078343861302517, "kl": 0.00142669677734375, "learning_rate": 9.876863921496276e-07, "loss": 0.0001, "num_tokens": 2210995.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 196.0, "completions/max_terminated_length": 196.0, "completions/mean_length": 158.875, "completions/mean_terminated_length": 158.875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.16780409474106783, "frac_reward_zero_std": 1.0, "grad_norm": 0.5824300156931301, "kl": 0.0170745849609375, "learning_rate": 9.87588534727908e-07, "loss": 0.0007, "num_tokens": 2213178.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.16807172487622107, "frac_reward_zero_std": 1.0, "grad_norm": 0.01436634420899522, "kl": 0.0006427764892578125, "learning_rate": 9.87490295439378e-07, "loss": 0.0, "num_tokens": 2216070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.1683393550113743, "frac_reward_zero_std": 1.0, "grad_norm": 0.03992902822508017, "kl": 0.00142669677734375, "learning_rate": 9.873916743697673e-07, "loss": 0.0001, "num_tokens": 2219385.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 213.75, "completions/mean_terminated_length": 213.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.1686069851465275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0500944068866038, "kl": 0.001369476318359375, "learning_rate": 9.872926716051386e-07, "loss": 0.0001, "num_tokens": 2222011.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 478.125, "completions/mean_terminated_length": 478.125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.1688746152816807, "frac_reward_zero_std": 0.5, "grad_norm": 0.6521506600215328, "kl": 0.000652313232421875, "learning_rate": 9.871932872318881e-07, "loss": -0.0161, "num_tokens": 2226860.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.16914224541683393, "frac_reward_zero_std": 1.0, "grad_norm": 0.013362347546848601, "kl": 0.000457763671875, "learning_rate": 9.870935213367447e-07, "loss": 0.0, "num_tokens": 2230326.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 355.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.16940987555198717, "frac_reward_zero_std": 1.0, "grad_norm": 0.008472505197903239, "kl": 0.00041484832763671875, "learning_rate": 9.8699337400677e-07, "loss": 0.0, "num_tokens": 2234342.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.16967750568714038, "frac_reward_zero_std": 1.0, "grad_norm": 0.008096511235339504, "kl": 0.00047016143798828125, "learning_rate": 9.86892845329359e-07, "loss": 0.0, "num_tokens": 2237515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1699451358222936, "frac_reward_zero_std": 1.0, "grad_norm": 0.012720699850340285, "kl": 0.000476837158203125, "learning_rate": 9.867919353922395e-07, "loss": 0.0, "num_tokens": 2240881.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.1702127659574468, "frac_reward_zero_std": 1.0, "grad_norm": 0.02532077503002811, "kl": 0.000911712646484375, "learning_rate": 9.866906442834712e-07, "loss": 0.0, "num_tokens": 2244353.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 410.5, "completions/mean_terminated_length": 410.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.17048039609260002, "frac_reward_zero_std": 0.5, "grad_norm": 1.191121802051271, "kl": 0.0006198883056640625, "learning_rate": 9.865889720914472e-07, "loss": -0.0018, "num_tokens": 2248909.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 489.875, "completions/mean_terminated_length": 489.875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.17074802622775326, "frac_reward_zero_std": 0.5, "grad_norm": 1.1229698335030363, "kl": 0.00077056884765625, "learning_rate": 9.864869189048931e-07, "loss": 0.0479, "num_tokens": 2254092.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 372.5, "completions/mean_terminated_length": 372.5, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.17101565636290647, "frac_reward_zero_std": 1.0, "grad_norm": 0.009976023358537442, "kl": 0.000530242919921875, "learning_rate": 9.863844848128666e-07, "loss": 0.0, "num_tokens": 2258300.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.17128328649805968, "frac_reward_zero_std": 0.5, "grad_norm": 0.8095138862757361, "kl": 0.00063323974609375, "learning_rate": 9.862816699047583e-07, "loss": 0.0601, "num_tokens": 2261988.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 337.25, "completions/mean_terminated_length": 337.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.1715509166332129, "frac_reward_zero_std": 1.0, "grad_norm": 0.011222808993801715, "kl": 0.000339508056640625, "learning_rate": 9.861784742702904e-07, "loss": 0.0, "num_tokens": 2265590.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1718185467683661, "frac_reward_zero_std": 1.0, "grad_norm": 0.019281590610695107, "kl": 0.00077056884765625, "learning_rate": 9.860748979995182e-07, "loss": 0.0, "num_tokens": 2269458.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.17208617690351935, "frac_reward_zero_std": 1.0, "grad_norm": 0.011439901220098178, "kl": 0.00046443939208984375, "learning_rate": 9.859709411828288e-07, "loss": 0.0, "num_tokens": 2272657.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.17235380703867256, "frac_reward_zero_std": 1.0, "grad_norm": 0.023589841532886735, "kl": 0.000766754150390625, "learning_rate": 9.85866603910941e-07, "loss": 0.0, "num_tokens": 2276728.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.17262143717382578, "frac_reward_zero_std": 1.0, "grad_norm": 0.026169969173681893, "kl": 0.000682830810546875, "learning_rate": 9.85761886274906e-07, "loss": 0.0, "num_tokens": 2279704.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 422.0, "completions/mean_terminated_length": 422.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.172889067308979, "frac_reward_zero_std": 1.0, "grad_norm": 0.031916002955368644, "kl": 0.0020294189453125, "learning_rate": 9.85656788366107e-07, "loss": 0.0001, "num_tokens": 2284276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.1731566974441322, "frac_reward_zero_std": 1.0, "grad_norm": 0.00846245942665928, "kl": 0.0003223419189453125, "learning_rate": 9.85551310276259e-07, "loss": 0.0, "num_tokens": 2287530.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.17342432757928541, "frac_reward_zero_std": 1.0, "grad_norm": 0.009600128251665133, "kl": 0.00042724609375, "learning_rate": 9.854454520974088e-07, "loss": 0.0, "num_tokens": 2291631.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 397.375, "completions/mean_terminated_length": 397.375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.17369195771443866, "frac_reward_zero_std": 1.0, "grad_norm": 0.008859496419850022, "kl": 0.000415802001953125, "learning_rate": 9.853392139219343e-07, "loss": 0.0, "num_tokens": 2296022.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.17395958784959187, "frac_reward_zero_std": 1.0, "grad_norm": 0.017511883856281257, "kl": 0.0006847381591796875, "learning_rate": 9.852325958425463e-07, "loss": 0.0, "num_tokens": 2299265.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.17422721798474508, "frac_reward_zero_std": 1.0, "grad_norm": 0.014813248880906895, "kl": 0.000492095947265625, "learning_rate": 9.851255979522854e-07, "loss": 0.0, "num_tokens": 2302675.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 395.875, "completions/mean_terminated_length": 395.875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.1744948481198983, "frac_reward_zero_std": 0.5, "grad_norm": 0.7268881104049632, "kl": 0.0003757476806640625, "learning_rate": 9.850182203445253e-07, "loss": -0.0345, "num_tokens": 2306974.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.1747624782550515, "frac_reward_zero_std": 1.0, "grad_norm": 0.0809439841494722, "kl": 0.00125885009765625, "learning_rate": 9.849104631129699e-07, "loss": 0.0001, "num_tokens": 2309859.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.17503010839020475, "frac_reward_zero_std": 0.5, "grad_norm": 1.0002354290436166, "kl": 0.005069732666015625, "learning_rate": 9.84802326351655e-07, "loss": 0.0299, "num_tokens": 2313144.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.17529773852535796, "frac_reward_zero_std": 1.0, "grad_norm": 0.007958524434449603, "kl": 0.00037479400634765625, "learning_rate": 9.846938101549476e-07, "loss": 0.0, "num_tokens": 2317028.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 342.375, "completions/mean_terminated_length": 342.375, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.17556536866051117, "frac_reward_zero_std": 1.0, "grad_norm": 0.008857852385960568, "kl": 0.00043392181396484375, "learning_rate": 9.845849146175452e-07, "loss": 0.0, "num_tokens": 2320799.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 393.75, "completions/mean_terminated_length": 393.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.17583299879566439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0076781350463391435, "kl": 0.0004482269287109375, "learning_rate": 9.84475639834477e-07, "loss": 0.0, "num_tokens": 2325129.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1761006289308176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012223154624115298, "kl": 0.0003795623779296875, "learning_rate": 9.843659859011035e-07, "loss": 0.0, "num_tokens": 2328094.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.17636825906597084, "frac_reward_zero_std": 1.0, "grad_norm": 0.008351408959075901, "kl": 0.0004482269287109375, "learning_rate": 9.842559529131146e-07, "loss": 0.0, "num_tokens": 2331625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.17663588920112405, "frac_reward_zero_std": 1.0, "grad_norm": 0.01758515040563569, "kl": 0.0008983612060546875, "learning_rate": 9.841455409665322e-07, "loss": 0.0, "num_tokens": 2335681.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.17690351933627727, "frac_reward_zero_std": 1.0, "grad_norm": 0.023302854443463982, "kl": 0.0010089874267578125, "learning_rate": 9.840347501577087e-07, "loss": 0.0, "num_tokens": 2339060.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.17717114947143048, "frac_reward_zero_std": 1.0, "grad_norm": 0.011624976494377234, "kl": 0.000957489013671875, "learning_rate": 9.839235805833268e-07, "loss": 0.0, "num_tokens": 2342276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.1774387796065837, "frac_reward_zero_std": 1.0, "grad_norm": 0.009989507558792746, "kl": 0.000354766845703125, "learning_rate": 9.838120323404003e-07, "loss": 0.0, "num_tokens": 2345580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.17770640974173693, "frac_reward_zero_std": 0.5, "grad_norm": 0.8644688875439224, "kl": 0.0007419586181640625, "learning_rate": 9.83700105526273e-07, "loss": 0.0081, "num_tokens": 2348794.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.17797403987689014, "frac_reward_zero_std": 1.0, "grad_norm": 0.014083764680760079, "kl": 0.0006542205810546875, "learning_rate": 9.83587800238619e-07, "loss": 0.0, "num_tokens": 2352067.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.17824167001204336, "frac_reward_zero_std": 0.5, "grad_norm": 0.8106242287410017, "kl": 0.0005807876586914062, "learning_rate": 9.834751165754428e-07, "loss": 0.0531, "num_tokens": 2355897.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 455.125, "completions/mean_terminated_length": 455.125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.17850930014719657, "frac_reward_zero_std": 1.0, "grad_norm": 0.012426594053818013, "kl": 0.0006542205810546875, "learning_rate": 9.833620546350792e-07, "loss": 0.0, "num_tokens": 2360694.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.17877693028234978, "frac_reward_zero_std": 1.0, "grad_norm": 0.02709023095351739, "kl": 0.00072479248046875, "learning_rate": 9.83248614516193e-07, "loss": 0.0, "num_tokens": 2363913.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.17904456041750302, "frac_reward_zero_std": 0.5, "grad_norm": 0.7953333390621233, "kl": 0.0005292892456054688, "learning_rate": 9.831347963177792e-07, "loss": -0.0438, "num_tokens": 2367617.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 288.625, "completions/mean_terminated_length": 288.625, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.17931219055265624, "frac_reward_zero_std": 1.0, "grad_norm": 0.008510976014122698, "kl": 0.0004329681396484375, "learning_rate": 9.830206001391626e-07, "loss": 0.0, "num_tokens": 2370902.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 355.75, "completions/mean_terminated_length": 355.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.17957982068780945, "frac_reward_zero_std": 1.0, "grad_norm": 0.03582563433327658, "kl": 0.0008792877197265625, "learning_rate": 9.829060260799978e-07, "loss": 0.0, "num_tokens": 2374968.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.17984745082296266, "frac_reward_zero_std": 1.0, "grad_norm": 0.01021182854606527, "kl": 0.0005359649658203125, "learning_rate": 9.827910742402691e-07, "loss": 0.0, "num_tokens": 2378699.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 347.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.18011508095811588, "frac_reward_zero_std": 1.0, "grad_norm": 0.018995745346175784, "kl": 0.0009822845458984375, "learning_rate": 9.826757447202906e-07, "loss": 0.0, "num_tokens": 2382759.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 408.875, "completions/mean_terminated_length": 408.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.18038271109326912, "frac_reward_zero_std": 1.0, "grad_norm": 0.02651449511555408, "kl": 0.000591278076171875, "learning_rate": 9.82560037620706e-07, "loss": 0.0, "num_tokens": 2387386.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.18065034122842233, "frac_reward_zero_std": 1.0, "grad_norm": 0.015431123561473355, "kl": 0.000858306884765625, "learning_rate": 9.824439530424886e-07, "loss": 0.0, "num_tokens": 2390324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 236.375, "completions/mean_terminated_length": 236.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.18091797136357554, "frac_reward_zero_std": 1.0, "grad_norm": 0.007663050544659845, "kl": 0.0002727508544921875, "learning_rate": 9.82327491086941e-07, "loss": 0.0, "num_tokens": 2393207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.18118560149872875, "frac_reward_zero_std": 1.0, "grad_norm": 0.06810888461983963, "kl": 0.0018463134765625, "learning_rate": 9.822106518556948e-07, "loss": 0.0001, "num_tokens": 2396425.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.18145323163388197, "frac_reward_zero_std": 1.0, "grad_norm": 0.08728519312527641, "kl": 0.002536773681640625, "learning_rate": 9.820934354507112e-07, "loss": 0.0001, "num_tokens": 2399934.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.18172086176903518, "frac_reward_zero_std": 1.0, "grad_norm": 0.022488035578916522, "kl": 0.0008754730224609375, "learning_rate": 9.819758419742808e-07, "loss": 0.0, "num_tokens": 2403293.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.18198849190418842, "frac_reward_zero_std": 1.0, "grad_norm": 0.012297334515520342, "kl": 0.0005893707275390625, "learning_rate": 9.818578715290227e-07, "loss": 0.0, "num_tokens": 2406591.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.18225612203934163, "frac_reward_zero_std": 1.0, "grad_norm": 0.02691815628984148, "kl": 0.00153350830078125, "learning_rate": 9.817395242178853e-07, "loss": 0.0001, "num_tokens": 2410161.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.18252375217449485, "frac_reward_zero_std": 1.0, "grad_norm": 0.04758235070530089, "kl": 0.00199127197265625, "learning_rate": 9.816208001441456e-07, "loss": 0.0001, "num_tokens": 2413870.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.18279138230964806, "frac_reward_zero_std": 1.0, "grad_norm": 0.02130022632489111, "kl": 0.000934600830078125, "learning_rate": 9.815016994114098e-07, "loss": 0.0, "num_tokens": 2417069.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.18305901244480127, "frac_reward_zero_std": 1.0, "grad_norm": 0.02436692938340624, "kl": 0.001056671142578125, "learning_rate": 9.813822221236125e-07, "loss": 0.0, "num_tokens": 2420194.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1833266425799545, "frac_reward_zero_std": 1.0, "grad_norm": 0.032943759356503656, "kl": 0.0010242462158203125, "learning_rate": 9.812623683850166e-07, "loss": 0.0, "num_tokens": 2423137.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.18359427271510773, "frac_reward_zero_std": 1.0, "grad_norm": 0.015488633151819756, "kl": 0.0007038116455078125, "learning_rate": 9.811421383002145e-07, "loss": 0.0, "num_tokens": 2426029.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 496.375, "completions/mean_terminated_length": 421.0000305175781, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.18386190285026094, "frac_reward_zero_std": 0.5, "grad_norm": 0.6239607606673048, "kl": 0.0006275177001953125, "learning_rate": 9.81021531974126e-07, "loss": 0.0708, "num_tokens": 2431456.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 388.5, "completions/mean_terminated_length": 297.71429443359375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.18412953298541415, "frac_reward_zero_std": 0.5, "grad_norm": 0.49767083993824, "kl": 0.0006542205810546875, "learning_rate": 9.809005495119998e-07, "loss": 0.127, "num_tokens": 2435568.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.18439716312056736, "frac_reward_zero_std": 1.0, "grad_norm": 0.02088433508378464, "kl": 0.000736236572265625, "learning_rate": 9.807791910194127e-07, "loss": 0.0, "num_tokens": 2438734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.1846647932557206, "frac_reward_zero_std": 1.0, "grad_norm": 0.025081567607607776, "kl": 0.0008392333984375, "learning_rate": 9.806574566022693e-07, "loss": 0.0, "num_tokens": 2442931.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.18493242339087382, "frac_reward_zero_std": 1.0, "grad_norm": 0.033280919908520404, "kl": 0.0014801025390625, "learning_rate": 9.80535346366803e-07, "loss": 0.0001, "num_tokens": 2446129.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.18520005352602703, "frac_reward_zero_std": 1.0, "grad_norm": 0.06901287765401226, "kl": 0.001922607421875, "learning_rate": 9.804128604195744e-07, "loss": 0.0001, "num_tokens": 2449357.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.18546768366118024, "frac_reward_zero_std": 1.0, "grad_norm": 0.04461571079515624, "kl": 0.001384735107421875, "learning_rate": 9.802899988674726e-07, "loss": 0.0001, "num_tokens": 2452870.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.18573531379633346, "frac_reward_zero_std": 1.0, "grad_norm": 0.01281707998772548, "kl": 0.0004329681396484375, "learning_rate": 9.801667618177141e-07, "loss": 0.0, "num_tokens": 2456756.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1860029439314867, "frac_reward_zero_std": 1.0, "grad_norm": 0.04753302222708019, "kl": 0.001506805419921875, "learning_rate": 9.800431493778433e-07, "loss": 0.0001, "num_tokens": 2459988.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.1862705740666399, "frac_reward_zero_std": 1.0, "grad_norm": 0.043566497763973595, "kl": 0.0010986328125, "learning_rate": 9.799191616557315e-07, "loss": 0.0, "num_tokens": 2463490.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.18653820420179312, "frac_reward_zero_std": 1.0, "grad_norm": 0.017557050512735515, "kl": 0.0007305145263671875, "learning_rate": 9.797947987595788e-07, "loss": 0.0, "num_tokens": 2466283.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 345.875, "completions/mean_terminated_length": 345.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.18680583433694634, "frac_reward_zero_std": 0.5, "grad_norm": 0.8489196450363393, "kl": 0.001560211181640625, "learning_rate": 9.796700607979113e-07, "loss": 0.0825, "num_tokens": 2470034.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 208.75, "completions/mean_terminated_length": 208.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.18707346447209955, "frac_reward_zero_std": 1.0, "grad_norm": 0.01524664157431236, "kl": 0.0005273818969726562, "learning_rate": 9.795449478795834e-07, "loss": 0.0, "num_tokens": 2472620.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.1873410946072528, "frac_reward_zero_std": 0.5, "grad_norm": 1.864222479581708, "kl": 0.0007419586181640625, "learning_rate": 9.794194601137766e-07, "loss": -0.0224, "num_tokens": 2476521.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.187608724742406, "frac_reward_zero_std": 1.0, "grad_norm": 0.01555902696580503, "kl": 0.000598907470703125, "learning_rate": 9.792935976099987e-07, "loss": 0.0, "num_tokens": 2479152.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.18787635487755922, "frac_reward_zero_std": 1.0, "grad_norm": 0.020680716968606017, "kl": 0.0005817413330078125, "learning_rate": 9.791673604780856e-07, "loss": 0.0, "num_tokens": 2482611.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 369.875, "completions/mean_terminated_length": 369.875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.18814398501271243, "frac_reward_zero_std": 0.5, "grad_norm": 0.7384000681174067, "kl": 0.001094818115234375, "learning_rate": 9.790407488281992e-07, "loss": 0.001, "num_tokens": 2486642.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.18841161514786564, "frac_reward_zero_std": 1.0, "grad_norm": 0.017297563104944937, "kl": 0.00054168701171875, "learning_rate": 9.78913762770829e-07, "loss": 0.0, "num_tokens": 2490205.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.18867924528301888, "frac_reward_zero_std": 1.0, "grad_norm": 0.022717602844193215, "kl": 0.00099945068359375, "learning_rate": 9.78786402416791e-07, "loss": 0.0, "num_tokens": 2493609.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.1889468754181721, "frac_reward_zero_std": 1.0, "grad_norm": 0.017138157595234703, "kl": 0.0006427764892578125, "learning_rate": 9.786586678772273e-07, "loss": 0.0, "num_tokens": 2496947.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.1892145055533253, "frac_reward_zero_std": 1.0, "grad_norm": 0.013833061597378455, "kl": 0.0007038116455078125, "learning_rate": 9.78530559263607e-07, "loss": 0.0, "num_tokens": 2500410.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 309.625, "completions/mean_terminated_length": 309.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.18948213568847852, "frac_reward_zero_std": 1.0, "grad_norm": 0.011883121456898488, "kl": 0.000774383544921875, "learning_rate": 9.784020766877259e-07, "loss": 0.0, "num_tokens": 2503903.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 420.5, "completions/mean_terminated_length": 420.5, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.18974976582363173, "frac_reward_zero_std": 1.0, "grad_norm": 0.007940279006972677, "kl": 0.0005588531494140625, "learning_rate": 9.782732202617057e-07, "loss": 0.0, "num_tokens": 2508371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.19001739595878495, "frac_reward_zero_std": 1.0, "grad_norm": 0.04555403391020526, "kl": 0.0012054443359375, "learning_rate": 9.781439900979944e-07, "loss": 0.0, "num_tokens": 2512292.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.1902850260939382, "frac_reward_zero_std": 1.0, "grad_norm": 0.00953148209477173, "kl": 0.0005321502685546875, "learning_rate": 9.780143863093664e-07, "loss": 0.0, "num_tokens": 2516341.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 630.125, "completions/mean_terminated_length": 498.8333435058594, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.1905526562290914, "frac_reward_zero_std": 0.5, "grad_norm": 1.1252205873386907, "kl": 0.000751495361328125, "learning_rate": 9.778844090089217e-07, "loss": 0.0659, "num_tokens": 2522886.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.1908202863642446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03614997035957678, "kl": 0.0011463165283203125, "learning_rate": 9.777540583100869e-07, "loss": 0.0, "num_tokens": 2525831.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 320.875, "completions/mean_terminated_length": 320.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.19108791649939783, "frac_reward_zero_std": 1.0, "grad_norm": 0.009314953291850623, "kl": 0.0006036758422851562, "learning_rate": 9.776233343266137e-07, "loss": 0.0, "num_tokens": 2529358.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.19135554663455104, "frac_reward_zero_std": 1.0, "grad_norm": 0.0172343185865694, "kl": 0.0009469985961914062, "learning_rate": 9.7749223717258e-07, "loss": 0.0, "num_tokens": 2532757.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 317.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.19162317676970428, "frac_reward_zero_std": 1.0, "grad_norm": 0.006709819959730349, "kl": 0.0002799034118652344, "learning_rate": 9.773607669623899e-07, "loss": 0.0, "num_tokens": 2536328.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 228.875, "completions/mean_terminated_length": 228.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1918908069048575, "frac_reward_zero_std": 1.0, "grad_norm": 0.013331471349391128, "kl": 0.0005435943603515625, "learning_rate": 9.772289238107714e-07, "loss": 0.0, "num_tokens": 2539255.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 278.25, "completions/mean_terminated_length": 278.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1921584370400107, "frac_reward_zero_std": 1.0, "grad_norm": 0.013898572371945308, "kl": 0.0009517669677734375, "learning_rate": 9.770967078327799e-07, "loss": 0.0, "num_tokens": 2542653.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.19242606717516392, "frac_reward_zero_std": 1.0, "grad_norm": 0.047531876813710495, "kl": 0.001445770263671875, "learning_rate": 9.769641191437947e-07, "loss": 0.0001, "num_tokens": 2546393.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 251.0, "completions/mean_terminated_length": 251.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.19269369731031713, "frac_reward_zero_std": 1.0, "grad_norm": 0.02464160834957382, "kl": 0.001781463623046875, "learning_rate": 9.768311578595211e-07, "loss": 0.0001, "num_tokens": 2549425.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 318.125, "completions/mean_terminated_length": 318.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.19296132744547037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01179689394940405, "kl": 0.000591278076171875, "learning_rate": 9.766978240959892e-07, "loss": 0.0, "num_tokens": 2553158.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.19322895758062358, "frac_reward_zero_std": 1.0, "grad_norm": 0.010694232150206507, "kl": 0.000446319580078125, "learning_rate": 9.765641179695542e-07, "loss": 0.0, "num_tokens": 2556251.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 228.25, "completions/mean_terminated_length": 228.25, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1934965877157768, "frac_reward_zero_std": 1.0, "grad_norm": 0.0098392080094132, "kl": 0.00225830078125, "learning_rate": 9.764300395968968e-07, "loss": 0.0001, "num_tokens": 2559349.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 392.625, "completions/mean_terminated_length": 392.625, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.19376421785093, "frac_reward_zero_std": 0.5, "grad_norm": 1.1369760805306226, "kl": 0.00110626220703125, "learning_rate": 9.762955890950217e-07, "loss": -0.005, "num_tokens": 2563594.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 432.625, "completions/mean_terminated_length": 432.625, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.19403184798608322, "frac_reward_zero_std": 1.0, "grad_norm": 0.029395803611922865, "kl": 0.0011692047119140625, "learning_rate": 9.761607665812586e-07, "loss": 0.0, "num_tokens": 2568315.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.19429947812123646, "frac_reward_zero_std": 1.0, "grad_norm": 0.06715956816731543, "kl": 0.001934051513671875, "learning_rate": 9.760255721732624e-07, "loss": 0.0001, "num_tokens": 2572125.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 341.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.19456710825638968, "frac_reward_zero_std": 1.0, "grad_norm": 0.06588980022334437, "kl": 0.002655029296875, "learning_rate": 9.758900059890117e-07, "loss": 0.0001, "num_tokens": 2575930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1948347383915429, "frac_reward_zero_std": 1.0, "grad_norm": 0.014020817080025054, "kl": 0.000812530517578125, "learning_rate": 9.757540681468104e-07, "loss": 0.0, "num_tokens": 2580029.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 393.0, "completions/mean_terminated_length": 393.0, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.1951023685266961, "frac_reward_zero_std": 1.0, "grad_norm": 0.07603211590515702, "kl": 0.002033233642578125, "learning_rate": 9.756177587652854e-07, "loss": 0.0001, "num_tokens": 2584169.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 377.5, "completions/mean_terminated_length": 377.5, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.19536999866184931, "frac_reward_zero_std": 1.0, "grad_norm": 0.020700268093833442, "kl": 0.00122833251953125, "learning_rate": 9.754810779633897e-07, "loss": 0.0, "num_tokens": 2588313.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 164.0, "completions/mean_terminated_length": 164.0, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.19563762879700256, "frac_reward_zero_std": 0.5, "grad_norm": 1.2591439950475471, "kl": 0.0006237030029296875, "learning_rate": 9.753440258603988e-07, "loss": -0.0431, "num_tokens": 2590525.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.19590525893215577, "frac_reward_zero_std": 1.0, "grad_norm": 0.013441437204654731, "kl": 0.0005550384521484375, "learning_rate": 9.75206602575913e-07, "loss": 0.0, "num_tokens": 2593968.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.19617288906730898, "frac_reward_zero_std": 0.5, "grad_norm": 1.4364746570503641, "kl": 0.00440216064453125, "learning_rate": 9.750688082298565e-07, "loss": 0.0226, "num_tokens": 2597379.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 310.875, "completions/mean_terminated_length": 310.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.1964405192024622, "frac_reward_zero_std": 1.0, "grad_norm": 0.15124876073433996, "kl": 0.0054168701171875, "learning_rate": 9.749306429424768e-07, "loss": 0.0002, "num_tokens": 2600862.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.1967081493376154, "frac_reward_zero_std": 0.5, "grad_norm": 1.2221488981158781, "kl": 0.0030975341796875, "learning_rate": 9.747921068343458e-07, "loss": -0.1155, "num_tokens": 2604173.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.19697577947276865, "frac_reward_zero_std": 1.0, "grad_norm": 0.15950855934705496, "kl": 0.0058917999267578125, "learning_rate": 9.746532000263586e-07, "loss": 0.0002, "num_tokens": 2606614.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.19724340960792186, "frac_reward_zero_std": 1.0, "grad_norm": 0.029683373394467884, "kl": 0.0011844635009765625, "learning_rate": 9.74513922639734e-07, "loss": 0.0, "num_tokens": 2609680.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.19751103974307507, "frac_reward_zero_std": 1.0, "grad_norm": 0.013768492087091843, "kl": 0.0009307861328125, "learning_rate": 9.743742747960136e-07, "loss": 0.0, "num_tokens": 2612966.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.19777866987822829, "frac_reward_zero_std": 0.5, "grad_norm": 0.8496234238907587, "kl": 0.00206756591796875, "learning_rate": 9.742342566170632e-07, "loss": 0.0138, "num_tokens": 2616699.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.1980463000133815, "frac_reward_zero_std": 1.0, "grad_norm": 0.0736470360059211, "kl": 0.0029144287109375, "learning_rate": 9.740938682250712e-07, "loss": 0.0001, "num_tokens": 2619964.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1983139301485347, "frac_reward_zero_std": 1.0, "grad_norm": 0.04921141573682258, "kl": 0.00260162353515625, "learning_rate": 9.73953109742549e-07, "loss": 0.0001, "num_tokens": 2623426.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.19858156028368795, "frac_reward_zero_std": 1.0, "grad_norm": 0.018751056316985346, "kl": 0.00133514404296875, "learning_rate": 9.738119812923314e-07, "loss": 0.0001, "num_tokens": 2626683.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.19884919041884117, "frac_reward_zero_std": 1.0, "grad_norm": 0.03959745536576237, "kl": 0.00185394287109375, "learning_rate": 9.736704829975756e-07, "loss": 0.0001, "num_tokens": 2629858.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.19911682055399438, "frac_reward_zero_std": 1.0, "grad_norm": 0.008671434012566986, "kl": 0.00055694580078125, "learning_rate": 9.735286149817621e-07, "loss": 0.0, "num_tokens": 2632914.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 343.25, "completions/mean_terminated_length": 343.25, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.1993844506891476, "frac_reward_zero_std": 1.0, "grad_norm": 0.02343619560312013, "kl": 0.00128173828125, "learning_rate": 9.733863773686934e-07, "loss": 0.0001, "num_tokens": 2636892.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 364.125, "completions/mean_terminated_length": 364.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.1996520808243008, "frac_reward_zero_std": 1.0, "grad_norm": 0.020421956611995026, "kl": 0.0009212493896484375, "learning_rate": 9.73243770282495e-07, "loss": 0.0, "num_tokens": 2640985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.19991971095945404, "frac_reward_zero_std": 1.0, "grad_norm": 0.01190376563133536, "kl": 0.0008106231689453125, "learning_rate": 9.731007938476143e-07, "loss": 0.0, "num_tokens": 2644200.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.20018734109460726, "frac_reward_zero_std": 1.0, "grad_norm": 0.016329127200653732, "kl": 0.00089263916015625, "learning_rate": 9.72957448188822e-07, "loss": 0.0, "num_tokens": 2647371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.20045497122976047, "frac_reward_zero_std": 1.0, "grad_norm": 0.04314760028319644, "kl": 0.00139617919921875, "learning_rate": 9.728137334312096e-07, "loss": 0.0001, "num_tokens": 2650707.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.20072260136491368, "frac_reward_zero_std": 1.0, "grad_norm": 0.06024372510703003, "kl": 0.001575469970703125, "learning_rate": 9.726696497001923e-07, "loss": 0.0001, "num_tokens": 2653965.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.2009902315000669, "frac_reward_zero_std": 1.0, "grad_norm": 0.02935872789841555, "kl": 0.0012292861938476562, "learning_rate": 9.725251971215059e-07, "loss": 0.0, "num_tokens": 2657108.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.20125786163522014, "frac_reward_zero_std": 1.0, "grad_norm": 0.02238803626161564, "kl": 0.000873565673828125, "learning_rate": 9.72380375821209e-07, "loss": 0.0, "num_tokens": 2660588.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 338.125, "completions/mean_terminated_length": 338.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.20152549177037335, "frac_reward_zero_std": 1.0, "grad_norm": 0.008754845054402723, "kl": 0.0005931854248046875, "learning_rate": 9.722351859256813e-07, "loss": 0.0, "num_tokens": 2664429.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 290.625, "completions/mean_terminated_length": 290.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.20179312190552656, "frac_reward_zero_std": 1.0, "grad_norm": 0.01012953553700475, "kl": 0.000652313232421875, "learning_rate": 9.72089627561625e-07, "loss": 0.0, "num_tokens": 2668062.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 369.375, "completions/mean_terminated_length": 369.375, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.20206075204067978, "frac_reward_zero_std": 1.0, "grad_norm": 0.015194538849036784, "kl": 0.0007801055908203125, "learning_rate": 9.719437008560624e-07, "loss": 0.0, "num_tokens": 2672077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.202328382175833, "frac_reward_zero_std": 1.0, "grad_norm": 0.04863007417688136, "kl": 0.0017108917236328125, "learning_rate": 9.717974059363392e-07, "loss": 0.0001, "num_tokens": 2674886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 337.0, "completions/mean_terminated_length": 337.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.20259601231098623, "frac_reward_zero_std": 1.0, "grad_norm": 0.0302408868696162, "kl": 0.001155853271484375, "learning_rate": 9.716507429301207e-07, "loss": 0.0, "num_tokens": 2678722.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.20286364244613944, "frac_reward_zero_std": 1.0, "grad_norm": 0.007679785267197617, "kl": 0.0004444122314453125, "learning_rate": 9.715037119653943e-07, "loss": 0.0, "num_tokens": 2682366.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.20313127258129265, "frac_reward_zero_std": 1.0, "grad_norm": 0.042017298182203934, "kl": 0.001026153564453125, "learning_rate": 9.713563131704685e-07, "loss": 0.0, "num_tokens": 2685627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.20339890271644587, "frac_reward_zero_std": 0.5, "grad_norm": 1.0124456206640313, "kl": 0.0010967254638671875, "learning_rate": 9.712085466739725e-07, "loss": -0.038, "num_tokens": 2688821.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.20366653285159908, "frac_reward_zero_std": 1.0, "grad_norm": 0.012317286493375726, "kl": 0.000919342041015625, "learning_rate": 9.710604126048564e-07, "loss": 0.0, "num_tokens": 2691815.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 247.125, "completions/mean_terminated_length": 247.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.20393416298675232, "frac_reward_zero_std": 1.0, "grad_norm": 0.013657618110400734, "kl": 0.0006618499755859375, "learning_rate": 9.70911911092391e-07, "loss": 0.0, "num_tokens": 2694884.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.20420179312190553, "frac_reward_zero_std": 1.0, "grad_norm": 0.011878275954036956, "kl": 0.001087188720703125, "learning_rate": 9.707630422661683e-07, "loss": 0.0, "num_tokens": 2698184.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.20446942325705875, "frac_reward_zero_std": 1.0, "grad_norm": 0.010303108083667583, "kl": 0.0005245208740234375, "learning_rate": 9.706138062561e-07, "loss": 0.0, "num_tokens": 2701827.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.20473705339221196, "frac_reward_zero_std": 1.0, "grad_norm": 0.008301601063712841, "kl": 0.000579833984375, "learning_rate": 9.70464203192419e-07, "loss": 0.0, "num_tokens": 2705364.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 320.125, "completions/mean_terminated_length": 320.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.20500468352736517, "frac_reward_zero_std": 1.0, "grad_norm": 0.014756938409029849, "kl": 0.000759124755859375, "learning_rate": 9.703142332056781e-07, "loss": 0.0, "num_tokens": 2708905.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 313.875, "completions/mean_terminated_length": 313.875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.2052723136625184, "frac_reward_zero_std": 0.5, "grad_norm": 0.8780589768631673, "kl": 0.000705718994140625, "learning_rate": 9.701638964267504e-07, "loss": -0.0273, "num_tokens": 2712688.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.20553994379767163, "frac_reward_zero_std": 0.5, "grad_norm": 0.5768201858439206, "kl": 0.000637054443359375, "learning_rate": 9.700131929868288e-07, "loss": -0.065, "num_tokens": 2716461.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 392.625, "completions/mean_terminated_length": 392.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.20580757393282484, "frac_reward_zero_std": 1.0, "grad_norm": 0.010346905243226314, "kl": 0.0006885528564453125, "learning_rate": 9.698621230174268e-07, "loss": 0.0, "num_tokens": 2720678.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.20607520406797805, "frac_reward_zero_std": 0.5, "grad_norm": 0.7771558408152117, "kl": 0.00101470947265625, "learning_rate": 9.697106866503773e-07, "loss": 0.0424, "num_tokens": 2724945.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 240.625, "completions/mean_terminated_length": 240.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.20634283420313126, "frac_reward_zero_std": 1.0, "grad_norm": 0.026434394495502413, "kl": 0.0019073486328125, "learning_rate": 9.69558884017833e-07, "loss": 0.0001, "num_tokens": 2727870.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 380.375, "completions/mean_terminated_length": 380.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.20661046433828448, "frac_reward_zero_std": 1.0, "grad_norm": 0.018486345888426275, "kl": 0.0009613037109375, "learning_rate": 9.694067152522662e-07, "loss": 0.0, "num_tokens": 2731973.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 480.375, "completions/mean_terminated_length": 480.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.20687809447343772, "frac_reward_zero_std": 1.0, "grad_norm": 0.02153466551111585, "kl": 0.0011444091796875, "learning_rate": 9.692541804864688e-07, "loss": 0.0, "num_tokens": 2736888.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.20714572460859093, "frac_reward_zero_std": 1.0, "grad_norm": 0.02470685535159439, "kl": 0.001922607421875, "learning_rate": 9.691012798535522e-07, "loss": 0.0001, "num_tokens": 2740317.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.20741335474374414, "frac_reward_zero_std": 0.5, "grad_norm": 0.9947478752524276, "kl": 0.001743316650390625, "learning_rate": 9.68948013486947e-07, "loss": 0.012, "num_tokens": 2744283.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.20768098487889736, "frac_reward_zero_std": 1.0, "grad_norm": 0.029056254620653796, "kl": 0.001544952392578125, "learning_rate": 9.687943815204024e-07, "loss": 0.0001, "num_tokens": 2747231.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.20794861501405057, "frac_reward_zero_std": 1.0, "grad_norm": 0.030920679751158114, "kl": 0.0015869140625, "learning_rate": 9.686403840879875e-07, "loss": 0.0001, "num_tokens": 2750098.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 455.375, "completions/mean_terminated_length": 455.375, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 0.2082162451492038, "frac_reward_zero_std": 1.0, "grad_norm": 0.013069804951818921, "kl": 0.000701904296875, "learning_rate": 9.684860213240904e-07, "loss": 0.0, "num_tokens": 2754769.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 785.0, "completions/max_terminated_length": 785.0, "completions/mean_length": 434.625, "completions/mean_terminated_length": 434.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.20848387528435702, "frac_reward_zero_std": 1.0, "grad_norm": 0.009531184377364338, "kl": 0.0006160736083984375, "learning_rate": 9.68331293363417e-07, "loss": 0.0, "num_tokens": 2759382.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.20875150541951024, "frac_reward_zero_std": 0.5, "grad_norm": 0.7120694286120975, "kl": 0.00235748291015625, "learning_rate": 9.681762003409926e-07, "loss": 0.0855, "num_tokens": 2762987.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.20901913555466345, "frac_reward_zero_std": 1.0, "grad_norm": 0.018438784997655427, "kl": 0.001247406005859375, "learning_rate": 9.680207423921607e-07, "loss": 0.0, "num_tokens": 2766955.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.20928676568981666, "frac_reward_zero_std": 1.0, "grad_norm": 0.00821003495687319, "kl": 0.00048351287841796875, "learning_rate": 9.678649196525842e-07, "loss": 0.0, "num_tokens": 2769931.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 236.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.2095543958249699, "frac_reward_zero_std": 1.0, "grad_norm": 0.02352526842232261, "kl": 0.0018157958984375, "learning_rate": 9.677087322582432e-07, "loss": 0.0001, "num_tokens": 2772765.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.20982202596012312, "frac_reward_zero_std": 1.0, "grad_norm": 0.020029603321185332, "kl": 0.00113677978515625, "learning_rate": 9.675521803454368e-07, "loss": 0.0, "num_tokens": 2776046.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.21008965609527633, "frac_reward_zero_std": 1.0, "grad_norm": 0.03819120192298926, "kl": 0.0007305145263671875, "learning_rate": 9.673952640507813e-07, "loss": 0.0, "num_tokens": 2778689.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.21035728623042954, "frac_reward_zero_std": 0.5, "grad_norm": 0.8332420638166009, "kl": 0.000782012939453125, "learning_rate": 9.672379835112122e-07, "loss": 0.0116, "num_tokens": 2782237.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.21062491636558275, "frac_reward_zero_std": 1.0, "grad_norm": 0.013145130554062103, "kl": 0.0009174346923828125, "learning_rate": 9.670803388639817e-07, "loss": 0.0, "num_tokens": 2785641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.210892546500736, "frac_reward_zero_std": 1.0, "grad_norm": 0.009310579174553201, "kl": 0.0005950927734375, "learning_rate": 9.669223302466608e-07, "loss": 0.0, "num_tokens": 2789071.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 418.375, "completions/mean_terminated_length": 418.375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.2111601766358892, "frac_reward_zero_std": 0.5, "grad_norm": 0.4706791321305623, "kl": 0.000690460205078125, "learning_rate": 9.66763957797137e-07, "loss": 0.0186, "num_tokens": 2793790.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 302.75, "completions/mean_terminated_length": 302.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.21142780677104242, "frac_reward_zero_std": 1.0, "grad_norm": 0.03702019266780612, "kl": 0.000888824462890625, "learning_rate": 9.666052216536163e-07, "loss": 0.0, "num_tokens": 2797232.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.21169543690619563, "frac_reward_zero_std": 1.0, "grad_norm": 0.01848269532852315, "kl": 0.000873565673828125, "learning_rate": 9.664461219546217e-07, "loss": 0.0, "num_tokens": 2800285.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.21196306704134885, "frac_reward_zero_std": 1.0, "grad_norm": 0.012619273851234965, "kl": 0.000576019287109375, "learning_rate": 9.66286658838993e-07, "loss": 0.0, "num_tokens": 2803722.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.2122306971765021, "frac_reward_zero_std": 1.0, "grad_norm": 0.018236536306125734, "kl": 0.0008182525634765625, "learning_rate": 9.661268324458882e-07, "loss": 0.0, "num_tokens": 2807338.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 221.375, "completions/mean_terminated_length": 221.375, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.2124983273116553, "frac_reward_zero_std": 1.0, "grad_norm": 0.03088037135377545, "kl": 0.001308441162109375, "learning_rate": 9.65966642914781e-07, "loss": 0.0001, "num_tokens": 2810129.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.2127659574468085, "frac_reward_zero_std": 1.0, "grad_norm": 0.019934616034637663, "kl": 0.00119781494140625, "learning_rate": 9.658060903854633e-07, "loss": 0.0, "num_tokens": 2813440.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 350.25, "completions/mean_terminated_length": 350.25, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.21303358758196173, "frac_reward_zero_std": 0.0, "grad_norm": 1.880287797101617, "kl": 0.0007610321044921875, "learning_rate": 9.656451749980427e-07, "loss": 0.0436, "num_tokens": 2817738.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 406.25, "completions/mean_terminated_length": 406.25, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.21330121771711494, "frac_reward_zero_std": 1.0, "grad_norm": 0.009404154869072345, "kl": 0.0006771087646484375, "learning_rate": 9.654838968929444e-07, "loss": 0.0, "num_tokens": 2822040.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.21356884785226818, "frac_reward_zero_std": 0.5, "grad_norm": 1.0503675676548765, "kl": 0.000644683837890625, "learning_rate": 9.65322256210909e-07, "loss": -0.0395, "num_tokens": 2825195.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.2138364779874214, "frac_reward_zero_std": 1.0, "grad_norm": 0.014593074976104228, "kl": 0.0006504058837890625, "learning_rate": 9.651602530929947e-07, "loss": 0.0, "num_tokens": 2828335.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2141041081225746, "frac_reward_zero_std": 1.0, "grad_norm": 0.022945869000246075, "kl": 0.0008602142333984375, "learning_rate": 9.649978876805752e-07, "loss": 0.0, "num_tokens": 2831759.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 243.25, "completions/mean_terminated_length": 243.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.21437173825772782, "frac_reward_zero_std": 1.0, "grad_norm": 0.007158909340396765, "kl": 0.00029277801513671875, "learning_rate": 9.648351601153408e-07, "loss": 0.0, "num_tokens": 2834705.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 361.75, "completions/mean_terminated_length": 361.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.21463936839288103, "frac_reward_zero_std": 1.0, "grad_norm": 0.02159812219914637, "kl": 0.0010528564453125, "learning_rate": 9.646720705392976e-07, "loss": 0.0, "num_tokens": 2838587.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.21490699852803424, "frac_reward_zero_std": 0.5, "grad_norm": 0.8555905717005353, "kl": 0.001678466796875, "learning_rate": 9.645086190947674e-07, "loss": 0.0001, "num_tokens": 2841933.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.21517462866318748, "frac_reward_zero_std": 1.0, "grad_norm": 0.049565152636250706, "kl": 0.0021209716796875, "learning_rate": 9.64344805924388e-07, "loss": 0.0001, "num_tokens": 2844684.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.2154422587983407, "frac_reward_zero_std": 1.0, "grad_norm": 0.01021215105037191, "kl": 0.0006208419799804688, "learning_rate": 9.641806311711133e-07, "loss": 0.0, "num_tokens": 2848288.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.2157098889334939, "frac_reward_zero_std": 1.0, "grad_norm": 0.03044781734204542, "kl": 0.00128936767578125, "learning_rate": 9.64016094978212e-07, "loss": 0.0001, "num_tokens": 2851942.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.21597751906864712, "frac_reward_zero_std": 1.0, "grad_norm": 0.0293684137901894, "kl": 0.0009975433349609375, "learning_rate": 9.638511974892688e-07, "loss": 0.0, "num_tokens": 2855103.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.21624514920380034, "frac_reward_zero_std": 1.0, "grad_norm": 0.07151350619446428, "kl": 0.002445220947265625, "learning_rate": 9.636859388481832e-07, "loss": 0.0001, "num_tokens": 2858292.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 251.5, "completions/mean_terminated_length": 251.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.21651277933895358, "frac_reward_zero_std": 1.0, "grad_norm": 0.01948623358570967, "kl": 0.0009365081787109375, "learning_rate": 9.635203191991701e-07, "loss": 0.0, "num_tokens": 2861324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.2167804094741068, "frac_reward_zero_std": 0.5, "grad_norm": 0.8844055918326096, "kl": 0.0008945465087890625, "learning_rate": 9.633543386867598e-07, "loss": 0.0284, "num_tokens": 2864310.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.21704803960926, "frac_reward_zero_std": 1.0, "grad_norm": 0.057048489143493905, "kl": 0.00257110595703125, "learning_rate": 9.631879974557966e-07, "loss": 0.0001, "num_tokens": 2867323.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.21731566974441321, "frac_reward_zero_std": 1.0, "grad_norm": 0.03433393892501471, "kl": 0.0014781951904296875, "learning_rate": 9.63021295651441e-07, "loss": 0.0001, "num_tokens": 2870436.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 285.5, "completions/mean_terminated_length": 285.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.21758329987956643, "frac_reward_zero_std": 1.0, "grad_norm": 0.019547004289054374, "kl": 0.0010585784912109375, "learning_rate": 9.628542334191663e-07, "loss": 0.0, "num_tokens": 2873788.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.21785093001471967, "frac_reward_zero_std": 1.0, "grad_norm": 0.014451619840041155, "kl": 0.0011920928955078125, "learning_rate": 9.62686810904762e-07, "loss": 0.0, "num_tokens": 2877882.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.21811856014987288, "frac_reward_zero_std": 1.0, "grad_norm": 0.012721004232530834, "kl": 0.0006875991821289062, "learning_rate": 9.625190282543313e-07, "loss": 0.0, "num_tokens": 2881051.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 431.875, "completions/mean_terminated_length": 431.875, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.2183861902850261, "frac_reward_zero_std": 0.5, "grad_norm": 1.3188757778419216, "kl": 0.00112152099609375, "learning_rate": 9.623508856142913e-07, "loss": -0.0095, "num_tokens": 2885518.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 282.875, "completions/mean_terminated_length": 282.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2186538204201793, "frac_reward_zero_std": 1.0, "grad_norm": 0.039929253510675916, "kl": 0.00394439697265625, "learning_rate": 9.62182383131374e-07, "loss": 0.0002, "num_tokens": 2888921.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 337.625, "completions/mean_terminated_length": 337.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.21892145055533252, "frac_reward_zero_std": 1.0, "grad_norm": 0.009618347038566866, "kl": 0.0006008148193359375, "learning_rate": 9.62013520952625e-07, "loss": 0.0, "num_tokens": 2892566.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.21918908069048576, "frac_reward_zero_std": 1.0, "grad_norm": 0.009908751717750764, "kl": 0.0005397796630859375, "learning_rate": 9.618442992254039e-07, "loss": 0.0, "num_tokens": 2896238.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.21945671082563897, "frac_reward_zero_std": 1.0, "grad_norm": 0.018261749991776106, "kl": 0.00083160400390625, "learning_rate": 9.616747180973838e-07, "loss": 0.0, "num_tokens": 2899696.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.21972434096079219, "frac_reward_zero_std": 1.0, "grad_norm": 0.0196948834571224, "kl": 0.001331329345703125, "learning_rate": 9.615047777165523e-07, "loss": 0.0001, "num_tokens": 2903940.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.2199919710959454, "frac_reward_zero_std": 0.5, "grad_norm": 1.1761791693725865, "kl": 0.01078033447265625, "learning_rate": 9.613344782312092e-07, "loss": -0.0163, "num_tokens": 2907368.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2202596012310986, "frac_reward_zero_std": 1.0, "grad_norm": 0.015619490057231705, "kl": 0.0008678436279296875, "learning_rate": 9.611638197899687e-07, "loss": 0.0, "num_tokens": 2909984.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.22052723136625185, "frac_reward_zero_std": 1.0, "grad_norm": 0.018412214557292295, "kl": 0.002166748046875, "learning_rate": 9.609928025417578e-07, "loss": 0.0001, "num_tokens": 2913883.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.22079486150140507, "frac_reward_zero_std": 1.0, "grad_norm": 0.02775894270268847, "kl": 0.001953125, "learning_rate": 9.60821426635817e-07, "loss": 0.0001, "num_tokens": 2917452.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.22106249163655828, "frac_reward_zero_std": 1.0, "grad_norm": 0.037782114024442426, "kl": 0.0023956298828125, "learning_rate": 9.60649692221699e-07, "loss": 0.0001, "num_tokens": 2920583.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 388.125, "completions/mean_terminated_length": 388.125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.2213301217717115, "frac_reward_zero_std": 1.0, "grad_norm": 0.020176847629766584, "kl": 0.001399993896484375, "learning_rate": 9.604775994492702e-07, "loss": 0.0001, "num_tokens": 2924792.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 266.25, "completions/mean_terminated_length": 266.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.2215977519068647, "frac_reward_zero_std": 1.0, "grad_norm": 0.05184848333621495, "kl": 0.005401611328125, "learning_rate": 9.603051484687095e-07, "loss": 0.0002, "num_tokens": 2927874.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.22186538204201794, "frac_reward_zero_std": 1.0, "grad_norm": 0.015657934558957802, "kl": 0.0019989013671875, "learning_rate": 9.601323394305081e-07, "loss": 0.0001, "num_tokens": 2931978.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.22213301217717116, "frac_reward_zero_std": 1.0, "grad_norm": 0.08046233412592003, "kl": 0.00428009033203125, "learning_rate": 9.599591724854698e-07, "loss": 0.0002, "num_tokens": 2935449.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.22240064231232437, "frac_reward_zero_std": 1.0, "grad_norm": 0.011813142811304191, "kl": 0.00063323974609375, "learning_rate": 9.597856477847108e-07, "loss": 0.0, "num_tokens": 2939125.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.22266827244747758, "frac_reward_zero_std": 1.0, "grad_norm": 0.019455005888623603, "kl": 0.0013141632080078125, "learning_rate": 9.596117654796598e-07, "loss": 0.0001, "num_tokens": 2942646.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2229359025826308, "frac_reward_zero_std": 0.5, "grad_norm": 1.4584718373113172, "kl": 0.0054473876953125, "learning_rate": 9.594375257220568e-07, "loss": 0.0852, "num_tokens": 2946107.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.223203532717784, "frac_reward_zero_std": 0.5, "grad_norm": 1.170078483087122, "kl": 0.00359344482421875, "learning_rate": 9.592629286639545e-07, "loss": -0.0413, "num_tokens": 2949883.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 414.75, "completions/mean_terminated_length": 414.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.22347116285293725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0257936234203661, "kl": 0.00292205810546875, "learning_rate": 9.590879744577168e-07, "loss": 0.0001, "num_tokens": 2954721.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.22373879298809046, "frac_reward_zero_std": 1.0, "grad_norm": 0.027695692132419573, "kl": 0.001979827880859375, "learning_rate": 9.589126632560199e-07, "loss": 0.0001, "num_tokens": 2958216.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 436.25, "completions/mean_terminated_length": 436.25, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.22400642312324368, "frac_reward_zero_std": 0.5, "grad_norm": 0.7388275974691793, "kl": 0.001216888427734375, "learning_rate": 9.587369952118509e-07, "loss": 0.0011, "num_tokens": 2962890.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 414.5, "completions/mean_terminated_length": 414.5, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.2242740532583969, "frac_reward_zero_std": 1.0, "grad_norm": 0.012931344420191579, "kl": 0.000762939453125, "learning_rate": 9.585609704785087e-07, "loss": 0.0, "num_tokens": 2967174.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.2245416833935501, "frac_reward_zero_std": 1.0, "grad_norm": 0.013107516579175733, "kl": 0.000965118408203125, "learning_rate": 9.583845892096037e-07, "loss": 0.0, "num_tokens": 2970542.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 379.375, "completions/mean_terminated_length": 379.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.22480931352870334, "frac_reward_zero_std": 1.0, "grad_norm": 0.022267690048495357, "kl": 0.00141143798828125, "learning_rate": 9.582078515590563e-07, "loss": 0.0001, "num_tokens": 2974613.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 350.875, "completions/mean_terminated_length": 350.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.22507694366385655, "frac_reward_zero_std": 0.5, "grad_norm": 1.2858212985915634, "kl": 0.00341033935546875, "learning_rate": 9.580307576810997e-07, "loss": -0.0513, "num_tokens": 2978404.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.22534457379900977, "frac_reward_zero_std": 1.0, "grad_norm": 0.01637921898459646, "kl": 0.0012760162353515625, "learning_rate": 9.578533077302761e-07, "loss": 0.0001, "num_tokens": 2982287.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.22561220393416298, "frac_reward_zero_std": 1.0, "grad_norm": 0.009410401280554336, "kl": 0.000812530517578125, "learning_rate": 9.576755018614398e-07, "loss": 0.0, "num_tokens": 2986724.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 350.75, "completions/mean_terminated_length": 350.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.2258798340693162, "frac_reward_zero_std": 0.5, "grad_norm": 0.6424237680280437, "kl": 0.00165557861328125, "learning_rate": 9.574973402297548e-07, "loss": 0.0003, "num_tokens": 2990578.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 385.5, "completions/mean_terminated_length": 385.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.22614746420446943, "frac_reward_zero_std": 1.0, "grad_norm": 0.013556506886551273, "kl": 0.000736236572265625, "learning_rate": 9.573188229906965e-07, "loss": 0.0, "num_tokens": 2994774.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 201.25, "completions/mean_terminated_length": 201.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.22641509433962265, "frac_reward_zero_std": 1.0, "grad_norm": 0.1348597859317355, "kl": 0.0045928955078125, "learning_rate": 9.571399503000498e-07, "loss": 0.0002, "num_tokens": 2997236.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.22668272447477586, "frac_reward_zero_std": 0.5, "grad_norm": 0.9161158394316918, "kl": 0.00048828125, "learning_rate": 9.5696072231391e-07, "loss": -0.0204, "num_tokens": 3000979.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.22695035460992907, "frac_reward_zero_std": 0.5, "grad_norm": 0.7366145773877063, "kl": 0.00116729736328125, "learning_rate": 9.567811391886825e-07, "loss": -0.0052, "num_tokens": 3004981.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.22721798474508229, "frac_reward_zero_std": 1.0, "grad_norm": 0.017776752007299588, "kl": 0.00110626220703125, "learning_rate": 9.566012010810825e-07, "loss": 0.0, "num_tokens": 3007892.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.22748561488023553, "frac_reward_zero_std": 1.0, "grad_norm": 0.019335087386494896, "kl": 0.000911712646484375, "learning_rate": 9.564209081481356e-07, "loss": 0.0, "num_tokens": 3011491.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.22775324501538874, "frac_reward_zero_std": 1.0, "grad_norm": 0.04824800406216737, "kl": 0.001171112060546875, "learning_rate": 9.562402605471763e-07, "loss": 0.0, "num_tokens": 3015013.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.22802087515054195, "frac_reward_zero_std": 1.0, "grad_norm": 0.03373608919911274, "kl": 0.00133514404296875, "learning_rate": 9.560592584358487e-07, "loss": 0.0001, "num_tokens": 3018791.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.22828850528569516, "frac_reward_zero_std": 1.0, "grad_norm": 0.022062909759769583, "kl": 0.00099945068359375, "learning_rate": 9.558779019721067e-07, "loss": 0.0, "num_tokens": 3022214.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 473.5, "completions/mean_terminated_length": 473.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.22855613542084838, "frac_reward_zero_std": 0.5, "grad_norm": 0.7450794804550676, "kl": 0.0008697509765625, "learning_rate": 9.556961913142129e-07, "loss": 0.0193, "num_tokens": 3027218.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.22882376555600162, "frac_reward_zero_std": 1.0, "grad_norm": 0.013191208213216942, "kl": 0.0006465911865234375, "learning_rate": 9.555141266207396e-07, "loss": 0.0, "num_tokens": 3029924.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 316.875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.22909139569115483, "frac_reward_zero_std": 1.0, "grad_norm": 0.013533762103238832, "kl": 0.00091552734375, "learning_rate": 9.553317080505675e-07, "loss": 0.0, "num_tokens": 3033507.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.22935902582630804, "frac_reward_zero_std": 1.0, "grad_norm": 0.019890965122813638, "kl": 0.00164794921875, "learning_rate": 9.551489357628863e-07, "loss": 0.0001, "num_tokens": 3037582.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.22962665596146126, "frac_reward_zero_std": 1.0, "grad_norm": 0.009069658752334312, "kl": 0.0006504058837890625, "learning_rate": 9.549658099171945e-07, "loss": 0.0, "num_tokens": 3041309.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 211.0, "completions/max_terminated_length": 211.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.22989428609661447, "frac_reward_zero_std": 1.0, "grad_norm": 0.022788198121781688, "kl": 0.0014934539794921875, "learning_rate": 9.547823306732989e-07, "loss": 0.0001, "num_tokens": 3043759.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.2301619162317677, "frac_reward_zero_std": 1.0, "grad_norm": 0.013627574948127889, "kl": 0.00086212158203125, "learning_rate": 9.545984981913149e-07, "loss": 0.0, "num_tokens": 3047117.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.23042954636692092, "frac_reward_zero_std": 1.0, "grad_norm": 0.010740145062336124, "kl": 0.0006046295166015625, "learning_rate": 9.54414312631666e-07, "loss": 0.0, "num_tokens": 3050192.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.23069717650207414, "frac_reward_zero_std": 1.0, "grad_norm": 0.011661858541937672, "kl": 0.0008335113525390625, "learning_rate": 9.54229774155084e-07, "loss": 0.0, "num_tokens": 3054025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.23096480663722735, "frac_reward_zero_std": 0.5, "grad_norm": 0.846570390768745, "kl": 0.000736236572265625, "learning_rate": 9.540448829226087e-07, "loss": -0.0183, "num_tokens": 3057548.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.23123243677238056, "frac_reward_zero_std": 1.0, "grad_norm": 0.015339323536951473, "kl": 0.000942230224609375, "learning_rate": 9.538596390955876e-07, "loss": 0.0, "num_tokens": 3061327.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.23150006690753377, "frac_reward_zero_std": 1.0, "grad_norm": 0.01080644472277767, "kl": 0.0007686614990234375, "learning_rate": 9.536740428356757e-07, "loss": 0.0, "num_tokens": 3065240.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.23176769704268702, "frac_reward_zero_std": 1.0, "grad_norm": 0.010487735568137231, "kl": 0.000957489013671875, "learning_rate": 9.534880943048358e-07, "loss": 0.0, "num_tokens": 3069304.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 325.0, "completions/mean_terminated_length": 325.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.23203532717784023, "frac_reward_zero_std": 1.0, "grad_norm": 0.011260987416255247, "kl": 0.0010471343994140625, "learning_rate": 9.533017936653383e-07, "loss": 0.0, "num_tokens": 3072980.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.23230295731299344, "frac_reward_zero_std": 1.0, "grad_norm": 0.01161055147298933, "kl": 0.0005340576171875, "learning_rate": 9.531151410797604e-07, "loss": 0.0, "num_tokens": 3076658.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.23257058744814665, "frac_reward_zero_std": 1.0, "grad_norm": 0.01564278032073378, "kl": 0.0007343292236328125, "learning_rate": 9.529281367109871e-07, "loss": 0.0, "num_tokens": 3080195.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 221.875, "completions/mean_terminated_length": 221.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.23283821758329987, "frac_reward_zero_std": 1.0, "grad_norm": 0.016185073252226012, "kl": 0.0010890960693359375, "learning_rate": 9.527407807222094e-07, "loss": 0.0, "num_tokens": 3082814.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 299.625, "completions/mean_terminated_length": 299.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2331058477184531, "frac_reward_zero_std": 1.0, "grad_norm": 0.016942684158101178, "kl": 0.001026153564453125, "learning_rate": 9.525530732769262e-07, "loss": 0.0, "num_tokens": 3086211.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.23337347785360632, "frac_reward_zero_std": 1.0, "grad_norm": 0.024628220854449188, "kl": 0.001556396484375, "learning_rate": 9.523650145389423e-07, "loss": 0.0001, "num_tokens": 3090033.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.23364110798875953, "frac_reward_zero_std": 1.0, "grad_norm": 0.02138954990269778, "kl": 0.0008087158203125, "learning_rate": 9.521766046723697e-07, "loss": 0.0, "num_tokens": 3093454.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.23390873812391275, "frac_reward_zero_std": 1.0, "grad_norm": 0.009548055657092411, "kl": 0.0007476806640625, "learning_rate": 9.519878438416261e-07, "loss": 0.0, "num_tokens": 3096529.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 409.75, "completions/mean_terminated_length": 409.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.23417636825906596, "frac_reward_zero_std": 0.5, "grad_norm": 0.5884397051230462, "kl": 0.0007114410400390625, "learning_rate": 9.517987322114363e-07, "loss": 0.0, "num_tokens": 3101083.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.2344439983942192, "frac_reward_zero_std": 1.0, "grad_norm": 0.022958157100504542, "kl": 0.00176239013671875, "learning_rate": 9.516092699468303e-07, "loss": 0.0001, "num_tokens": 3104552.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.2347116285293724, "frac_reward_zero_std": 1.0, "grad_norm": 0.011676672570918529, "kl": 0.000579833984375, "learning_rate": 9.514194572131452e-07, "loss": 0.0, "num_tokens": 3107437.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.23497925866452563, "frac_reward_zero_std": 1.0, "grad_norm": 0.016684198166346524, "kl": 0.000762939453125, "learning_rate": 9.512292941760228e-07, "loss": 0.0, "num_tokens": 3110398.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.23524688879967884, "frac_reward_zero_std": 0.5, "grad_norm": 0.7464240854798877, "kl": 0.0008544921875, "learning_rate": 9.510387810014114e-07, "loss": -0.005, "num_tokens": 3113888.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 373.5, "completions/mean_terminated_length": 373.5, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.23551451893483205, "frac_reward_zero_std": 1.0, "grad_norm": 0.01426991282684869, "kl": 0.0008907318115234375, "learning_rate": 9.508479178555645e-07, "loss": 0.0, "num_tokens": 3117896.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 237.0, "completions/mean_terminated_length": 237.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.2357821490699853, "frac_reward_zero_std": 1.0, "grad_norm": 0.011794749184321723, "kl": 0.000713348388671875, "learning_rate": 9.506567049050412e-07, "loss": 0.0, "num_tokens": 3120708.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2360497792051385, "frac_reward_zero_std": 0.5, "grad_norm": 0.7223709484441004, "kl": 0.00182342529296875, "learning_rate": 9.504651423167054e-07, "loss": 0.0197, "num_tokens": 3123972.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 378.125, "completions/mean_terminated_length": 378.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.23631740934029172, "frac_reward_zero_std": 1.0, "grad_norm": 0.008721602974822358, "kl": 0.000614166259765625, "learning_rate": 9.502732302577267e-07, "loss": 0.0, "num_tokens": 3128129.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 283.25, "completions/mean_terminated_length": 283.25, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.23658503947544493, "frac_reward_zero_std": 1.0, "grad_norm": 0.0667826028757007, "kl": 0.00150299072265625, "learning_rate": 9.500809688955797e-07, "loss": 0.0001, "num_tokens": 3131479.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 295.75, "completions/mean_terminated_length": 295.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.23685266961059814, "frac_reward_zero_std": 1.0, "grad_norm": 0.014677690937256847, "kl": 0.0009288787841796875, "learning_rate": 9.498883583980429e-07, "loss": 0.0, "num_tokens": 3134785.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.23712029974575138, "frac_reward_zero_std": 1.0, "grad_norm": 0.013609847265281997, "kl": 0.0005512237548828125, "learning_rate": 9.496953989332006e-07, "loss": 0.0, "num_tokens": 3138030.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 244.625, "completions/mean_terminated_length": 244.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2373879298809046, "frac_reward_zero_std": 1.0, "grad_norm": 0.010932014729787559, "kl": 0.000545501708984375, "learning_rate": 9.495020906694412e-07, "loss": 0.0, "num_tokens": 3140991.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2376555600160578, "frac_reward_zero_std": 1.0, "grad_norm": 0.017055552609287365, "kl": 0.0010662078857421875, "learning_rate": 9.493084337754571e-07, "loss": 0.0, "num_tokens": 3144088.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.23792319015121102, "frac_reward_zero_std": 1.0, "grad_norm": 0.04002160053093168, "kl": 0.00176239013671875, "learning_rate": 9.491144284202456e-07, "loss": 0.0001, "num_tokens": 3148281.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.23819082028636424, "frac_reward_zero_std": 1.0, "grad_norm": 0.022376268667067715, "kl": 0.00099945068359375, "learning_rate": 9.489200747731076e-07, "loss": 0.0, "num_tokens": 3151156.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.23845845042151748, "frac_reward_zero_std": 1.0, "grad_norm": 0.008953344296869815, "kl": 0.000568389892578125, "learning_rate": 9.487253730036482e-07, "loss": 0.0, "num_tokens": 3154549.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.2387260805566707, "frac_reward_zero_std": 1.0, "grad_norm": 0.01512396724972156, "kl": 0.00087738037109375, "learning_rate": 9.485303232817761e-07, "loss": 0.0, "num_tokens": 3158060.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.2389937106918239, "frac_reward_zero_std": 1.0, "grad_norm": 0.009704130521102274, "kl": 0.0005702972412109375, "learning_rate": 9.483349257777038e-07, "loss": 0.0, "num_tokens": 3162027.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.23926134082697711, "frac_reward_zero_std": 1.0, "grad_norm": 0.015129332341886667, "kl": 0.001094818115234375, "learning_rate": 9.481391806619474e-07, "loss": 0.0, "num_tokens": 3165627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 247.875, "completions/mean_terminated_length": 247.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.23952897096213033, "frac_reward_zero_std": 1.0, "grad_norm": 0.028207418708313156, "kl": 0.00127410888671875, "learning_rate": 9.479430881053257e-07, "loss": 0.0001, "num_tokens": 3168626.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.23979660109728354, "frac_reward_zero_std": 1.0, "grad_norm": 0.022879257742829846, "kl": 0.000995635986328125, "learning_rate": 9.477466482789618e-07, "loss": 0.0, "num_tokens": 3172514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.24006423123243678, "frac_reward_zero_std": 1.0, "grad_norm": 0.009866288173696478, "kl": 0.0009098052978515625, "learning_rate": 9.475498613542807e-07, "loss": 0.0, "num_tokens": 3176060.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 332.375, "completions/mean_terminated_length": 332.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.24033186136759, "frac_reward_zero_std": 1.0, "grad_norm": 0.02229153755810419, "kl": 0.00058746337890625, "learning_rate": 9.473527275030114e-07, "loss": 0.0, "num_tokens": 3179859.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.2405994915027432, "frac_reward_zero_std": 1.0, "grad_norm": 0.013521160063723482, "kl": 0.001033782958984375, "learning_rate": 9.471552468971846e-07, "loss": 0.0, "num_tokens": 3183002.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.24086712163789642, "frac_reward_zero_std": 0.5, "grad_norm": 0.9686868924151038, "kl": 0.0006237030029296875, "learning_rate": 9.469574197091343e-07, "loss": 0.0469, "num_tokens": 3186520.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.24113475177304963, "frac_reward_zero_std": 1.0, "grad_norm": 0.012361627787389846, "kl": 0.0006866455078125, "learning_rate": 9.467592461114967e-07, "loss": 0.0, "num_tokens": 3189417.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 330.75, "completions/mean_terminated_length": 330.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.24140238190820287, "frac_reward_zero_std": 1.0, "grad_norm": 0.03733408627640366, "kl": 0.0010967254638671875, "learning_rate": 9.465607262772104e-07, "loss": 0.0, "num_tokens": 3193047.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.24167001204335609, "frac_reward_zero_std": 1.0, "grad_norm": 0.009295407835935103, "kl": 0.000644683837890625, "learning_rate": 9.46361860379516e-07, "loss": 0.0, "num_tokens": 3195718.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.2419376421785093, "frac_reward_zero_std": 1.0, "grad_norm": 0.010637898554668903, "kl": 0.0007801055908203125, "learning_rate": 9.461626485919561e-07, "loss": 0.0, "num_tokens": 3199432.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 326.75, "completions/mean_terminated_length": 326.75, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.2422052723136625, "frac_reward_zero_std": 1.0, "grad_norm": 0.013863227386898838, "kl": 0.001018524169921875, "learning_rate": 9.459630910883754e-07, "loss": 0.0, "num_tokens": 3203114.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.24247290244881572, "frac_reward_zero_std": 1.0, "grad_norm": 0.012154399268656845, "kl": 0.000823974609375, "learning_rate": 9.4576318804292e-07, "loss": 0.0, "num_tokens": 3206294.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.24274053258396897, "frac_reward_zero_std": 1.0, "grad_norm": 0.015192408781344027, "kl": 0.000850677490234375, "learning_rate": 9.455629396300378e-07, "loss": 0.0, "num_tokens": 3210241.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.24300816271912218, "frac_reward_zero_std": 1.0, "grad_norm": 0.009232702113443144, "kl": 0.0007572174072265625, "learning_rate": 9.453623460244777e-07, "loss": 0.0, "num_tokens": 3214070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 347.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2432757928542754, "frac_reward_zero_std": 1.0, "grad_norm": 0.018879376669719398, "kl": 0.0008544921875, "learning_rate": 9.451614074012904e-07, "loss": 0.0, "num_tokens": 3217998.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2435434229894286, "frac_reward_zero_std": 0.5, "grad_norm": 0.7524927181377653, "kl": 0.001323699951171875, "learning_rate": 9.44960123935827e-07, "loss": 0.0073, "num_tokens": 3221940.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.24381105312458182, "frac_reward_zero_std": 1.0, "grad_norm": 0.02402494894439438, "kl": 0.0009136199951171875, "learning_rate": 9.447584958037401e-07, "loss": 0.0, "num_tokens": 3225855.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.24407868325973506, "frac_reward_zero_std": 1.0, "grad_norm": 0.009219285053430388, "kl": 0.00042438507080078125, "learning_rate": 9.44556523180983e-07, "loss": 0.0, "num_tokens": 3229268.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.24434631339488827, "frac_reward_zero_std": 1.0, "grad_norm": 0.02925003370235622, "kl": 0.001171112060546875, "learning_rate": 9.443542062438091e-07, "loss": 0.0, "num_tokens": 3232736.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.24461394353004148, "frac_reward_zero_std": 1.0, "grad_norm": 0.009399746415032521, "kl": 0.0005178451538085938, "learning_rate": 9.441515451687732e-07, "loss": 0.0, "num_tokens": 3236059.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 352.25, "completions/mean_terminated_length": 352.25, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2448815736651947, "frac_reward_zero_std": 0.5, "grad_norm": 0.8516203587272944, "kl": 0.0008487701416015625, "learning_rate": 9.439485401327295e-07, "loss": 0.1007, "num_tokens": 3239965.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 210.125, "completions/mean_terminated_length": 210.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.2451492038003479, "frac_reward_zero_std": 1.0, "grad_norm": 0.08698970460657421, "kl": 0.00225830078125, "learning_rate": 9.437451913128328e-07, "loss": 0.0001, "num_tokens": 3242550.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.24541683393550115, "frac_reward_zero_std": 1.0, "grad_norm": 0.05555413362833559, "kl": 0.00171661376953125, "learning_rate": 9.43541498886538e-07, "loss": 0.0001, "num_tokens": 3245851.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.24568446407065436, "frac_reward_zero_std": 1.0, "grad_norm": 0.008448686104204012, "kl": 0.000507354736328125, "learning_rate": 9.433374630315996e-07, "loss": 0.0, "num_tokens": 3248864.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.24595209420580758, "frac_reward_zero_std": 1.0, "grad_norm": 0.00899794721819318, "kl": 0.0006561279296875, "learning_rate": 9.431330839260718e-07, "loss": 0.0, "num_tokens": 3252180.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 226.0, "completions/max_terminated_length": 226.0, "completions/mean_length": 208.0, "completions/mean_terminated_length": 208.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2462197243409608, "frac_reward_zero_std": 1.0, "grad_norm": 0.01012834280175967, "kl": 0.0004987716674804688, "learning_rate": 9.429283617483087e-07, "loss": 0.0, "num_tokens": 3254792.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.246487354476114, "frac_reward_zero_std": 1.0, "grad_norm": 0.04761613360087681, "kl": 0.0013275146484375, "learning_rate": 9.427232966769632e-07, "loss": 0.0001, "num_tokens": 3258312.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 279.375, "completions/mean_terminated_length": 279.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.24675498461126724, "frac_reward_zero_std": 1.0, "grad_norm": 0.02014935270133192, "kl": 0.001079559326171875, "learning_rate": 9.425178888909882e-07, "loss": 0.0, "num_tokens": 3261535.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 366.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.24702261474642045, "frac_reward_zero_std": 1.0, "grad_norm": 0.02738707783094104, "kl": 0.001041412353515625, "learning_rate": 9.423121385696349e-07, "loss": 0.0, "num_tokens": 3265728.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.24729024488157367, "frac_reward_zero_std": 1.0, "grad_norm": 0.02062127406525, "kl": 0.0010223388671875, "learning_rate": 9.421060458924537e-07, "loss": 0.0, "num_tokens": 3268555.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 383.0, "completions/mean_terminated_length": 383.0, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.24755787501672688, "frac_reward_zero_std": 1.0, "grad_norm": 0.01381707290090874, "kl": 0.001155853271484375, "learning_rate": 9.418996110392941e-07, "loss": 0.0, "num_tokens": 3272719.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2478255051518801, "frac_reward_zero_std": 1.0, "grad_norm": 0.03958191568757048, "kl": 0.0014801025390625, "learning_rate": 9.416928341903036e-07, "loss": 0.0001, "num_tokens": 3276630.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.2480931352870333, "frac_reward_zero_std": 1.0, "grad_norm": 0.0073031086993248714, "kl": 0.00046253204345703125, "learning_rate": 9.414857155259289e-07, "loss": 0.0, "num_tokens": 3280435.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 541.25, "completions/mean_terminated_length": 541.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.24836076542218655, "frac_reward_zero_std": 1.0, "grad_norm": 0.009398686162653056, "kl": 0.0006561279296875, "learning_rate": 9.41278255226914e-07, "loss": 0.0, "num_tokens": 3286193.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 359.375, "completions/mean_terminated_length": 359.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.24862839555733976, "frac_reward_zero_std": 1.0, "grad_norm": 0.013615634572359411, "kl": 0.000701904296875, "learning_rate": 9.410704534743019e-07, "loss": 0.0, "num_tokens": 3290168.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.24889602569249297, "frac_reward_zero_std": 0.5, "grad_norm": 0.5866779847176611, "kl": 0.0013885498046875, "learning_rate": 9.408623104494334e-07, "loss": 0.0001, "num_tokens": 3293162.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.24916365582764619, "frac_reward_zero_std": 1.0, "grad_norm": 0.014455119946841791, "kl": 0.001247406005859375, "learning_rate": 9.406538263339465e-07, "loss": 0.0001, "num_tokens": 3296926.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 372.75, "completions/mean_terminated_length": 372.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.2494312859627994, "frac_reward_zero_std": 1.0, "grad_norm": 0.00866088434923776, "kl": 0.0005893707275390625, "learning_rate": 9.404450013097778e-07, "loss": 0.0, "num_tokens": 3300984.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.24969891609795264, "frac_reward_zero_std": 0.5, "grad_norm": 0.8539657125868704, "kl": 0.0006160736083984375, "learning_rate": 9.402358355591607e-07, "loss": 0.0571, "num_tokens": 3304838.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.24996654623310585, "frac_reward_zero_std": 0.5, "grad_norm": 0.8905828246637938, "kl": 0.001438140869140625, "learning_rate": 9.400263292646263e-07, "loss": 0.0001, "num_tokens": 3308163.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.25023417636825906, "frac_reward_zero_std": 1.0, "grad_norm": 0.013428206697052546, "kl": 0.001522064208984375, "learning_rate": 9.398164826090027e-07, "loss": 0.0001, "num_tokens": 3312075.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 438.125, "completions/mean_terminated_length": 438.125, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.2505018065034123, "frac_reward_zero_std": 0.0, "grad_norm": 0.9301655133729579, "kl": 0.0009479522705078125, "learning_rate": 9.39606295775415e-07, "loss": -0.0342, "num_tokens": 3316748.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.2507694366385655, "frac_reward_zero_std": 1.0, "grad_norm": 0.020584847014140817, "kl": 0.000659942626953125, "learning_rate": 9.39395768947285e-07, "loss": 0.0, "num_tokens": 3319717.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.25103706677371873, "frac_reward_zero_std": 1.0, "grad_norm": 0.020520810525842825, "kl": 0.001129150390625, "learning_rate": 9.391849023083319e-07, "loss": 0.0, "num_tokens": 3322833.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.2513046969088719, "frac_reward_zero_std": 1.0, "grad_norm": 0.03073713865049338, "kl": 0.0010662078857421875, "learning_rate": 9.389736960425707e-07, "loss": 0.0, "num_tokens": 3326085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 356.25, "completions/mean_terminated_length": 356.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.25157232704402516, "frac_reward_zero_std": 1.0, "grad_norm": 0.015993913171491233, "kl": 0.0009975433349609375, "learning_rate": 9.38762150334313e-07, "loss": 0.0, "num_tokens": 3330159.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2518399571791784, "frac_reward_zero_std": 1.0, "grad_norm": 0.047748986954690945, "kl": 0.0015106201171875, "learning_rate": 9.385502653681669e-07, "loss": 0.0001, "num_tokens": 3333076.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.2521075873143316, "frac_reward_zero_std": 1.0, "grad_norm": 0.012002648511271646, "kl": 0.0007648468017578125, "learning_rate": 9.38338041329036e-07, "loss": 0.0, "num_tokens": 3336356.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 469.0, "completions/mean_terminated_length": 469.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.2523752174494848, "frac_reward_zero_std": 0.5, "grad_norm": 0.539401453880737, "kl": 0.0008487701416015625, "learning_rate": 9.381254784021207e-07, "loss": 0.0487, "num_tokens": 3341048.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.252642847584638, "frac_reward_zero_std": 1.0, "grad_norm": 0.011650921899170896, "kl": 0.000759124755859375, "learning_rate": 9.379125767729157e-07, "loss": 0.0, "num_tokens": 3344351.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 215.875, "completions/mean_terminated_length": 215.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.25291047771979125, "frac_reward_zero_std": 1.0, "grad_norm": 0.01585740361103308, "kl": 0.00102996826171875, "learning_rate": 9.376993366272127e-07, "loss": 0.0, "num_tokens": 3346982.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.2531781078549445, "frac_reward_zero_std": 1.0, "grad_norm": 0.01720180001730624, "kl": 0.0011119842529296875, "learning_rate": 9.374857581510982e-07, "loss": 0.0, "num_tokens": 3350025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2534457379900977, "frac_reward_zero_std": 1.0, "grad_norm": 0.01312779122909294, "kl": 0.0009632110595703125, "learning_rate": 9.372718415309539e-07, "loss": 0.0, "num_tokens": 3353838.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2537133681252509, "frac_reward_zero_std": 1.0, "grad_norm": 0.008800403931586697, "kl": 0.0005397796630859375, "learning_rate": 9.370575869534564e-07, "loss": 0.0, "num_tokens": 3356673.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.2539809982604041, "frac_reward_zero_std": 1.0, "grad_norm": 0.014147540831380277, "kl": 0.001033782958984375, "learning_rate": 9.368429946055779e-07, "loss": 0.0, "num_tokens": 3359570.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 438.625, "completions/mean_terminated_length": 438.625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.25424862839555734, "frac_reward_zero_std": 0.5, "grad_norm": 0.6522862096830974, "kl": 0.0009307861328125, "learning_rate": 9.36628064674585e-07, "loss": -0.014, "num_tokens": 3364415.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.2545162585307106, "frac_reward_zero_std": 1.0, "grad_norm": 0.013339636544842047, "kl": 0.000904083251953125, "learning_rate": 9.364127973480383e-07, "loss": 0.0, "num_tokens": 3368193.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.25478388866586377, "frac_reward_zero_std": 1.0, "grad_norm": 0.012688395669862849, "kl": 0.0006542205810546875, "learning_rate": 9.361971928137939e-07, "loss": 0.0, "num_tokens": 3371288.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.255051518801017, "frac_reward_zero_std": 1.0, "grad_norm": 0.011458207422380028, "kl": 0.0007915496826171875, "learning_rate": 9.359812512600014e-07, "loss": 0.0, "num_tokens": 3374560.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2553191489361702, "frac_reward_zero_std": 1.0, "grad_norm": 0.024171280279119543, "kl": 0.00157928466796875, "learning_rate": 9.357649728751049e-07, "loss": 0.0001, "num_tokens": 3378441.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.25558677907132343, "frac_reward_zero_std": 1.0, "grad_norm": 0.011112753000628915, "kl": 0.00078582763671875, "learning_rate": 9.355483578478423e-07, "loss": 0.0, "num_tokens": 3381866.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.2558544092064767, "frac_reward_zero_std": 1.0, "grad_norm": 0.012628700770520694, "kl": 0.0007762908935546875, "learning_rate": 9.353314063672451e-07, "loss": 0.0, "num_tokens": 3385284.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.25612203934162986, "frac_reward_zero_std": 0.5, "grad_norm": 0.7834163367379752, "kl": 0.000946044921875, "learning_rate": 9.351141186226386e-07, "loss": -0.1071, "num_tokens": 3389128.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2563896694767831, "frac_reward_zero_std": 1.0, "grad_norm": 0.018429379796500164, "kl": 0.00077056884765625, "learning_rate": 9.348964948036415e-07, "loss": 0.0, "num_tokens": 3392120.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.2566572996119363, "frac_reward_zero_std": 1.0, "grad_norm": 0.011497945334761488, "kl": 0.0007228851318359375, "learning_rate": 9.346785351001661e-07, "loss": 0.0, "num_tokens": 3395824.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2569249297470895, "frac_reward_zero_std": 1.0, "grad_norm": 0.009049823920909856, "kl": 0.0006561279296875, "learning_rate": 9.344602397024171e-07, "loss": 0.0, "num_tokens": 3399412.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.25719255988224277, "frac_reward_zero_std": 1.0, "grad_norm": 0.019746750789779024, "kl": 0.0008640289306640625, "learning_rate": 9.342416088008928e-07, "loss": 0.0, "num_tokens": 3402368.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.25746019001739595, "frac_reward_zero_std": 1.0, "grad_norm": 0.010542673427123745, "kl": 0.000911712646484375, "learning_rate": 9.34022642586384e-07, "loss": 0.0, "num_tokens": 3405684.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2577278201525492, "frac_reward_zero_std": 1.0, "grad_norm": 0.016941807404874692, "kl": 0.001407623291015625, "learning_rate": 9.33803341249974e-07, "loss": 0.0001, "num_tokens": 3408796.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 383.375, "completions/mean_terminated_length": 383.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.2579954502877024, "frac_reward_zero_std": 1.0, "grad_norm": 0.009663725345591657, "kl": 0.000743865966796875, "learning_rate": 9.335837049830391e-07, "loss": 0.0, "num_tokens": 3412915.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.2582630804228556, "frac_reward_zero_std": 1.0, "grad_norm": 0.016417214319137008, "kl": 0.00144195556640625, "learning_rate": 9.333637339772471e-07, "loss": 0.0001, "num_tokens": 3416356.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.25853071055800886, "frac_reward_zero_std": 1.0, "grad_norm": 0.014442526066722968, "kl": 0.00093841552734375, "learning_rate": 9.331434284245584e-07, "loss": 0.0, "num_tokens": 3420066.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.25879834069316204, "frac_reward_zero_std": 1.0, "grad_norm": 0.010139037956800243, "kl": 0.000736236572265625, "learning_rate": 9.329227885172255e-07, "loss": 0.0, "num_tokens": 3423797.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.2590659708283153, "frac_reward_zero_std": 1.0, "grad_norm": 0.008118463298526403, "kl": 0.000701904296875, "learning_rate": 9.32701814447792e-07, "loss": 0.0, "num_tokens": 3427562.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.25933360096346847, "frac_reward_zero_std": 1.0, "grad_norm": 0.02191085623281072, "kl": 0.000896453857421875, "learning_rate": 9.324805064090938e-07, "loss": 0.0, "num_tokens": 3430665.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.2596012310986217, "frac_reward_zero_std": 1.0, "grad_norm": 0.011092457589213178, "kl": 0.0007839202880859375, "learning_rate": 9.32258864594258e-07, "loss": 0.0, "num_tokens": 3434727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 214.25, "completions/mean_terminated_length": 214.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.25986886123377495, "frac_reward_zero_std": 1.0, "grad_norm": 0.018090934637235764, "kl": 0.001190185546875, "learning_rate": 9.320368891967028e-07, "loss": 0.0, "num_tokens": 3437413.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.26013649136892814, "frac_reward_zero_std": 1.0, "grad_norm": 0.009983743914914278, "kl": 0.0004253387451171875, "learning_rate": 9.318145804101376e-07, "loss": 0.0, "num_tokens": 3440333.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 366.875, "completions/mean_terminated_length": 366.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.2604041215040814, "frac_reward_zero_std": 1.0, "grad_norm": 0.02692143473119178, "kl": 0.00087738037109375, "learning_rate": 9.315919384285631e-07, "loss": 0.0, "num_tokens": 3444404.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 431.25, "completions/mean_terminated_length": 431.25, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.26067175163923456, "frac_reward_zero_std": 0.5, "grad_norm": 0.7277715602364802, "kl": 0.0034942626953125, "learning_rate": 9.313689634462702e-07, "loss": 0.0173, "num_tokens": 3449098.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 364.875, "completions/mean_terminated_length": 364.875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2609393817743878, "frac_reward_zero_std": 0.5, "grad_norm": 0.7259431506798377, "kl": 0.0006923675537109375, "learning_rate": 9.311456556578407e-07, "loss": 0.0338, "num_tokens": 3453253.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.261207011909541, "frac_reward_zero_std": 1.0, "grad_norm": 0.025938902581875825, "kl": 0.001110076904296875, "learning_rate": 9.309220152581468e-07, "loss": 0.0, "num_tokens": 3456064.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.2614746420446942, "frac_reward_zero_std": 1.0, "grad_norm": 0.016233430149860978, "kl": 0.0009784698486328125, "learning_rate": 9.30698042442351e-07, "loss": 0.0, "num_tokens": 3459239.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.26174227217984747, "frac_reward_zero_std": 0.5, "grad_norm": 1.1973313314015037, "kl": 0.001781463623046875, "learning_rate": 9.30473737405906e-07, "loss": 0.0106, "num_tokens": 3463412.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 289.75, "completions/mean_terminated_length": 289.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.26200990231500065, "frac_reward_zero_std": 0.5, "grad_norm": 1.323705700291855, "kl": 0.001552581787109375, "learning_rate": 9.302491003445539e-07, "loss": 0.0121, "num_tokens": 3466874.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 220.375, "completions/mean_terminated_length": 220.375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2622775324501539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0666432924149557, "kl": 0.0030193328857421875, "learning_rate": 9.300241314543268e-07, "loss": 0.0001, "num_tokens": 3469685.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.2625451625853071, "frac_reward_zero_std": 1.0, "grad_norm": 0.019509127366003697, "kl": 0.0011386871337890625, "learning_rate": 9.297988309315469e-07, "loss": 0.0, "num_tokens": 3473071.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 467.25, "completions/mean_terminated_length": 467.25, "completions/min_length": 377.0, "completions/min_terminated_length": 377.0, "epoch": 0.2628127927204603, "frac_reward_zero_std": 0.5, "grad_norm": 0.828710218233226, "kl": 0.0034942626953125, "learning_rate": 9.29573198972825e-07, "loss": 0.0052, "num_tokens": 3478113.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 410.0, "completions/mean_terminated_length": 410.0, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.26308042285561356, "frac_reward_zero_std": 0.5, "grad_norm": 0.5693586814909444, "kl": 0.0010833740234375, "learning_rate": 9.293472357750618e-07, "loss": -0.0375, "num_tokens": 3482629.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.26334805299076675, "frac_reward_zero_std": 0.5, "grad_norm": 0.7027610661674943, "kl": 0.00176239013671875, "learning_rate": 9.291209415354464e-07, "loss": -0.0298, "num_tokens": 3486115.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.26361568312592, "frac_reward_zero_std": 1.0, "grad_norm": 0.021466864234385326, "kl": 0.001552581787109375, "learning_rate": 9.288943164514575e-07, "loss": 0.0001, "num_tokens": 3489593.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.26388331326107317, "frac_reward_zero_std": 1.0, "grad_norm": 0.020228532848280488, "kl": 0.00191497802734375, "learning_rate": 9.28667360720862e-07, "loss": 0.0001, "num_tokens": 3493213.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 288.0, "completions/mean_terminated_length": 288.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.2641509433962264, "frac_reward_zero_std": 1.0, "grad_norm": 0.05503648383236704, "kl": 0.00298309326171875, "learning_rate": 9.284400745417153e-07, "loss": 0.0001, "num_tokens": 3496513.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.26441857353137965, "frac_reward_zero_std": 1.0, "grad_norm": 0.04009656909309204, "kl": 0.00185394287109375, "learning_rate": 9.282124581123618e-07, "loss": 0.0001, "num_tokens": 3500563.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.26468620366653284, "frac_reward_zero_std": 1.0, "grad_norm": 0.026130865493594807, "kl": 0.00313568115234375, "learning_rate": 9.279845116314334e-07, "loss": 0.0001, "num_tokens": 3503961.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.2649538338016861, "frac_reward_zero_std": 1.0, "grad_norm": 0.05597670537292202, "kl": 0.00289154052734375, "learning_rate": 9.277562352978503e-07, "loss": 0.0001, "num_tokens": 3507788.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.26522146393683926, "frac_reward_zero_std": 1.0, "grad_norm": 0.014335032753002838, "kl": 0.0008087158203125, "learning_rate": 9.275276293108206e-07, "loss": 0.0, "num_tokens": 3510953.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2654890940719925, "frac_reward_zero_std": 1.0, "grad_norm": 0.021714539813177103, "kl": 0.000957489013671875, "learning_rate": 9.272986938698401e-07, "loss": 0.0, "num_tokens": 3513728.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.26575672420714574, "frac_reward_zero_std": 0.5, "grad_norm": 0.551417615832443, "kl": 0.00206756591796875, "learning_rate": 9.270694291746918e-07, "loss": 0.0197, "num_tokens": 3517792.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.26602435434229893, "frac_reward_zero_std": 1.0, "grad_norm": 0.010897481330352694, "kl": 0.0007648468017578125, "learning_rate": 9.268398354254464e-07, "loss": 0.0, "num_tokens": 3521495.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.26629198447745217, "frac_reward_zero_std": 1.0, "grad_norm": 0.014532411794262326, "kl": 0.0010547637939453125, "learning_rate": 9.266099128224616e-07, "loss": 0.0, "num_tokens": 3524755.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.26655961461260536, "frac_reward_zero_std": 1.0, "grad_norm": 0.010759873149543346, "kl": 0.0007915496826171875, "learning_rate": 9.263796615663819e-07, "loss": 0.0, "num_tokens": 3527611.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.2668272447477586, "frac_reward_zero_std": 1.0, "grad_norm": 0.0177816078612442, "kl": 0.001758575439453125, "learning_rate": 9.26149081858139e-07, "loss": 0.0001, "num_tokens": 3530919.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 307.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.26709487488291184, "frac_reward_zero_std": 0.5, "grad_norm": 1.1273224528140684, "kl": 0.002227783203125, "learning_rate": 9.259181738989512e-07, "loss": 0.0005, "num_tokens": 3534414.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.267362505018065, "frac_reward_zero_std": 0.5, "grad_norm": 1.1208350354041963, "kl": 0.001556396484375, "learning_rate": 9.256869378903225e-07, "loss": 0.0748, "num_tokens": 3538158.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.26763013515321826, "frac_reward_zero_std": 1.0, "grad_norm": 0.01876780998404342, "kl": 0.001392364501953125, "learning_rate": 9.254553740340441e-07, "loss": 0.0001, "num_tokens": 3541879.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.26789776528837145, "frac_reward_zero_std": 0.5, "grad_norm": 0.6104508153610565, "kl": 0.00200653076171875, "learning_rate": 9.252234825321928e-07, "loss": 0.0832, "num_tokens": 3546121.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.2681653954235247, "frac_reward_zero_std": 0.5, "grad_norm": 1.1314576344325202, "kl": 0.004352569580078125, "learning_rate": 9.249912635871317e-07, "loss": -0.078, "num_tokens": 3549292.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.26843302555867793, "frac_reward_zero_std": 0.5, "grad_norm": 0.9308733340077251, "kl": 0.0010528564453125, "learning_rate": 9.247587174015093e-07, "loss": 0.0491, "num_tokens": 3552400.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.2687006556938311, "frac_reward_zero_std": 1.0, "grad_norm": 0.052994493939225173, "kl": 0.00188446044921875, "learning_rate": 9.245258441782595e-07, "loss": 0.0001, "num_tokens": 3556142.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.26896828582898435, "frac_reward_zero_std": 0.5, "grad_norm": 1.094021887540969, "kl": 0.002033233642578125, "learning_rate": 9.242926441206022e-07, "loss": 0.0722, "num_tokens": 3559677.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.26923591596413754, "frac_reward_zero_std": 1.0, "grad_norm": 0.01712447373606913, "kl": 0.001220703125, "learning_rate": 9.240591174320421e-07, "loss": 0.0, "num_tokens": 3563117.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.2695035460992908, "frac_reward_zero_std": 1.0, "grad_norm": 0.014836025976635688, "kl": 0.0007305145263671875, "learning_rate": 9.238252643163689e-07, "loss": 0.0, "num_tokens": 3566366.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.269771176234444, "frac_reward_zero_std": 1.0, "grad_norm": 0.02074687983464681, "kl": 0.00177001953125, "learning_rate": 9.235910849776577e-07, "loss": 0.0001, "num_tokens": 3569662.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.2700388063695972, "frac_reward_zero_std": 1.0, "grad_norm": 0.033485229920705566, "kl": 0.0020294189453125, "learning_rate": 9.233565796202674e-07, "loss": 0.0001, "num_tokens": 3573556.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 330.0, "completions/mean_terminated_length": 330.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.27030643650475045, "frac_reward_zero_std": 1.0, "grad_norm": 0.07853465977086005, "kl": 0.009429931640625, "learning_rate": 9.231217484488419e-07, "loss": 0.0004, "num_tokens": 3577360.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 358.25, "completions/mean_terminated_length": 358.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.27057406663990363, "frac_reward_zero_std": 1.0, "grad_norm": 0.03806038991396361, "kl": 0.00411224365234375, "learning_rate": 9.228865916683098e-07, "loss": 0.0002, "num_tokens": 3581490.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.2708416967750569, "frac_reward_zero_std": 1.0, "grad_norm": 0.04993342147046877, "kl": 0.00496673583984375, "learning_rate": 9.22651109483883e-07, "loss": 0.0002, "num_tokens": 3584524.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.2711093269102101, "frac_reward_zero_std": 1.0, "grad_norm": 0.05677908289828609, "kl": 0.00194549560546875, "learning_rate": 9.22415302101058e-07, "loss": 0.0001, "num_tokens": 3587417.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 339.625, "completions/mean_terminated_length": 339.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.2713769570453633, "frac_reward_zero_std": 0.5, "grad_norm": 0.7695282443350208, "kl": 0.00274658203125, "learning_rate": 9.221791697256151e-07, "loss": -0.024, "num_tokens": 3591398.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.27164458718051654, "frac_reward_zero_std": 1.0, "grad_norm": 0.013790272879109274, "kl": 0.000885009765625, "learning_rate": 9.219427125636177e-07, "loss": 0.0, "num_tokens": 3595178.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.2719122173156697, "frac_reward_zero_std": 1.0, "grad_norm": 0.025750235186461273, "kl": 0.001728057861328125, "learning_rate": 9.21705930821413e-07, "loss": 0.0001, "num_tokens": 3598934.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 295.75, "completions/mean_terminated_length": 295.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.27217984745082296, "frac_reward_zero_std": 1.0, "grad_norm": 0.023916709783951256, "kl": 0.001918792724609375, "learning_rate": 9.214688247056315e-07, "loss": 0.0001, "num_tokens": 3602420.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.2724474775859762, "frac_reward_zero_std": 1.0, "grad_norm": 0.03257033012507831, "kl": 0.00267791748046875, "learning_rate": 9.212313944231867e-07, "loss": 0.0001, "num_tokens": 3605868.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 247.0, "completions/mean_terminated_length": 247.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2727151077211294, "frac_reward_zero_std": 1.0, "grad_norm": 0.010001755208925066, "kl": 0.0006427764892578125, "learning_rate": 9.209936401812745e-07, "loss": 0.0, "num_tokens": 3608784.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.27298273785628263, "frac_reward_zero_std": 1.0, "grad_norm": 0.014896857577294044, "kl": 0.001125335693359375, "learning_rate": 9.207555621873746e-07, "loss": 0.0, "num_tokens": 3612348.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.2732503679914358, "frac_reward_zero_std": 1.0, "grad_norm": 0.0239219180470324, "kl": 0.0022411346435546875, "learning_rate": 9.205171606492481e-07, "loss": 0.0001, "num_tokens": 3615421.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 396.375, "completions/mean_terminated_length": 396.375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.27351799812658906, "frac_reward_zero_std": 1.0, "grad_norm": 0.01375220739901215, "kl": 0.00086212158203125, "learning_rate": 9.202784357749392e-07, "loss": 0.0, "num_tokens": 3619716.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 222.375, "completions/mean_terminated_length": 222.375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.2737856282617423, "frac_reward_zero_std": 1.0, "grad_norm": 0.01272923414181084, "kl": 0.0008716583251953125, "learning_rate": 9.20039387772774e-07, "loss": 0.0, "num_tokens": 3622483.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.2740532583968955, "frac_reward_zero_std": 0.5, "grad_norm": 0.6897634141736502, "kl": 0.00128173828125, "learning_rate": 9.198000168513602e-07, "loss": -0.0842, "num_tokens": 3626618.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 747.0, "completions/max_terminated_length": 747.0, "completions/mean_length": 406.0, "completions/mean_terminated_length": 406.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2743208885320487, "frac_reward_zero_std": 1.0, "grad_norm": 0.019785794437876415, "kl": 0.00176239013671875, "learning_rate": 9.195603232195881e-07, "loss": 0.0001, "num_tokens": 3630938.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.2745885186672019, "frac_reward_zero_std": 1.0, "grad_norm": 0.04611434781421816, "kl": 0.00363922119140625, "learning_rate": 9.193203070866289e-07, "loss": 0.0001, "num_tokens": 3634649.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 479.75, "completions/mean_terminated_length": 402.0000305175781, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.27485614880235515, "frac_reward_zero_std": 0.5, "grad_norm": 0.632470910947033, "kl": 0.00284576416015625, "learning_rate": 9.190799686619357e-07, "loss": 0.1166, "num_tokens": 3639607.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 350.375, "completions/mean_terminated_length": 350.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.2751237789375084, "frac_reward_zero_std": 1.0, "grad_norm": 0.031457792451088255, "kl": 0.00298309326171875, "learning_rate": 9.188393081552425e-07, "loss": 0.0001, "num_tokens": 3643610.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.2753914090726616, "frac_reward_zero_std": 1.0, "grad_norm": 0.01084236426520639, "kl": 0.0007953643798828125, "learning_rate": 9.185983257765646e-07, "loss": 0.0, "num_tokens": 3647042.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.2756590392078148, "frac_reward_zero_std": 1.0, "grad_norm": 0.01833574407588512, "kl": 0.0010738372802734375, "learning_rate": 9.183570217361981e-07, "loss": 0.0, "num_tokens": 3651171.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.275926669342968, "frac_reward_zero_std": 1.0, "grad_norm": 0.010215967001358671, "kl": 0.000732421875, "learning_rate": 9.1811539624472e-07, "loss": 0.0, "num_tokens": 3654547.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 370.875, "completions/mean_terminated_length": 370.875, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.27619429947812124, "frac_reward_zero_std": 1.0, "grad_norm": 0.019836633163214004, "kl": 0.00212860107421875, "learning_rate": 9.178734495129874e-07, "loss": 0.0001, "num_tokens": 3658622.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.2764619296132745, "frac_reward_zero_std": 1.0, "grad_norm": 0.016948301766769747, "kl": 0.00127410888671875, "learning_rate": 9.176311817521382e-07, "loss": 0.0001, "num_tokens": 3662440.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.27672955974842767, "frac_reward_zero_std": 1.0, "grad_norm": 0.019986881360525476, "kl": 0.00152587890625, "learning_rate": 9.1738859317359e-07, "loss": 0.0001, "num_tokens": 3666213.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.2769971898835809, "frac_reward_zero_std": 1.0, "grad_norm": 0.018017614366701927, "kl": 0.00151824951171875, "learning_rate": 9.171456839890407e-07, "loss": 0.0001, "num_tokens": 3669439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 319.5, "completions/mean_terminated_length": 319.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2772648200187341, "frac_reward_zero_std": 1.0, "grad_norm": 0.01363275452233916, "kl": 0.001010894775390625, "learning_rate": 9.16902454410468e-07, "loss": 0.0, "num_tokens": 3673219.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 642.0, "completions/max_terminated_length": 642.0, "completions/mean_length": 370.375, "completions/mean_terminated_length": 370.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.27753245015388733, "frac_reward_zero_std": 1.0, "grad_norm": 0.020844356520021208, "kl": 0.00154876708984375, "learning_rate": 9.166589046501289e-07, "loss": 0.0001, "num_tokens": 3677318.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.2778000802890405, "frac_reward_zero_std": 1.0, "grad_norm": 0.012106046249992116, "kl": 0.0007534027099609375, "learning_rate": 9.164150349205599e-07, "loss": 0.0, "num_tokens": 3680484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.27806771042419376, "frac_reward_zero_std": 1.0, "grad_norm": 0.015691215700372737, "kl": 0.001224517822265625, "learning_rate": 9.161708454345772e-07, "loss": 0.0, "num_tokens": 3683285.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.278335340559347, "frac_reward_zero_std": 1.0, "grad_norm": 0.012796547693072351, "kl": 0.001155853271484375, "learning_rate": 9.159263364052749e-07, "loss": 0.0, "num_tokens": 3686723.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 309.5, "completions/mean_terminated_length": 309.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2786029706945002, "frac_reward_zero_std": 1.0, "grad_norm": 0.19846147160942068, "kl": 0.004482269287109375, "learning_rate": 9.156815080460276e-07, "loss": 0.0002, "num_tokens": 3690335.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.2788706008296534, "frac_reward_zero_std": 1.0, "grad_norm": 0.321667111277745, "kl": 0.00830078125, "learning_rate": 9.15436360570487e-07, "loss": 0.0003, "num_tokens": 3694289.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.2791382309648066, "frac_reward_zero_std": 1.0, "grad_norm": 0.01718156805070264, "kl": 0.00121307373046875, "learning_rate": 9.151908941925843e-07, "loss": 0.0, "num_tokens": 3697571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.27940586109995985, "frac_reward_zero_std": 0.5, "grad_norm": 1.2959140521801535, "kl": 0.00118255615234375, "learning_rate": 9.149451091265286e-07, "loss": -0.0102, "num_tokens": 3701570.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.2796734912351131, "frac_reward_zero_std": 1.0, "grad_norm": 0.010248089910094915, "kl": 0.000728607177734375, "learning_rate": 9.146990055868068e-07, "loss": 0.0, "num_tokens": 3705224.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 316.5, "completions/mean_terminated_length": 316.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.2799411213702663, "frac_reward_zero_std": 1.0, "grad_norm": 0.018654311467150863, "kl": 0.0013885498046875, "learning_rate": 9.144525837881844e-07, "loss": 0.0001, "num_tokens": 3708668.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2802087515054195, "frac_reward_zero_std": 0.5, "grad_norm": 0.7827673028482914, "kl": 0.00243377685546875, "learning_rate": 9.142058439457043e-07, "loss": 0.0113, "num_tokens": 3711703.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.2804763816405727, "frac_reward_zero_std": 1.0, "grad_norm": 0.025837528705821775, "kl": 0.001811981201171875, "learning_rate": 9.139587862746869e-07, "loss": 0.0001, "num_tokens": 3715282.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 223.5, "completions/mean_terminated_length": 223.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.28074401177572594, "frac_reward_zero_std": 1.0, "grad_norm": 0.014632371432334656, "kl": 0.001201629638671875, "learning_rate": 9.137114109907299e-07, "loss": 0.0, "num_tokens": 3718134.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.2810116419108792, "frac_reward_zero_std": 1.0, "grad_norm": 0.02368213188952698, "kl": 0.001476287841796875, "learning_rate": 9.134637183097083e-07, "loss": 0.0001, "num_tokens": 3721098.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 365.625, "completions/mean_terminated_length": 365.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.28127927204603237, "frac_reward_zero_std": 1.0, "grad_norm": 0.00881087758919663, "kl": 0.000659942626953125, "learning_rate": 9.13215708447774e-07, "loss": 0.0, "num_tokens": 3725055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2815469021811856, "frac_reward_zero_std": 1.0, "grad_norm": 0.026297766311202078, "kl": 0.00238037109375, "learning_rate": 9.12967381621356e-07, "loss": 0.0001, "num_tokens": 3728540.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 755.0, "completions/max_terminated_length": 755.0, "completions/mean_length": 386.125, "completions/mean_terminated_length": 386.125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.2818145323163388, "frac_reward_zero_std": 1.0, "grad_norm": 0.021408593037282417, "kl": 0.00193023681640625, "learning_rate": 9.127187380471594e-07, "loss": 0.0001, "num_tokens": 3732793.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 268.25, "completions/mean_terminated_length": 268.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.28208216245149204, "frac_reward_zero_std": 1.0, "grad_norm": 0.010790774869619074, "kl": 0.00092315673828125, "learning_rate": 9.12469777942166e-07, "loss": 0.0, "num_tokens": 3735835.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.2823497925866453, "frac_reward_zero_std": 1.0, "grad_norm": 0.09177547245357137, "kl": 0.0035858154296875, "learning_rate": 9.122205015236337e-07, "loss": 0.0001, "num_tokens": 3738633.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.28261742272179846, "frac_reward_zero_std": 1.0, "grad_norm": 0.020792280565605368, "kl": 0.002025604248046875, "learning_rate": 9.119709090090968e-07, "loss": 0.0001, "num_tokens": 3742517.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 273.25, "completions/mean_terminated_length": 273.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.2828850528569517, "frac_reward_zero_std": 1.0, "grad_norm": 0.019461153967580944, "kl": 0.0013275146484375, "learning_rate": 9.117210006163647e-07, "loss": 0.0001, "num_tokens": 3745727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 409.0, "completions/mean_terminated_length": 409.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.2831526829921049, "frac_reward_zero_std": 1.0, "grad_norm": 0.023580056489274488, "kl": 0.00127410888671875, "learning_rate": 9.114707765635235e-07, "loss": 0.0001, "num_tokens": 3750091.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 480.625, "completions/mean_terminated_length": 480.625, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.2834203131272581, "frac_reward_zero_std": 0.5, "grad_norm": 0.7484911016884013, "kl": 0.000934600830078125, "learning_rate": 9.112202370689335e-07, "loss": 0.035, "num_tokens": 3755212.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.28368794326241137, "frac_reward_zero_std": 1.0, "grad_norm": 0.012479524132069944, "kl": 0.0009517669677734375, "learning_rate": 9.109693823512316e-07, "loss": 0.0, "num_tokens": 3758327.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 245.375, "completions/mean_terminated_length": 245.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.28395557339756455, "frac_reward_zero_std": 1.0, "grad_norm": 0.09639080983050495, "kl": 0.0035858154296875, "learning_rate": 9.107182126293287e-07, "loss": 0.0001, "num_tokens": 3761318.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 260.5, "completions/mean_terminated_length": 260.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2842232035327178, "frac_reward_zero_std": 1.0, "grad_norm": 0.01472480149782918, "kl": 0.00077056884765625, "learning_rate": 9.104667281224113e-07, "loss": 0.0, "num_tokens": 3764430.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.284490833667871, "frac_reward_zero_std": 1.0, "grad_norm": 0.013849355983955156, "kl": 0.001094818115234375, "learning_rate": 9.102149290499401e-07, "loss": 0.0, "num_tokens": 3767927.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 355.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.2847584638030242, "frac_reward_zero_std": 0.5, "grad_norm": 0.5411798311128734, "kl": 0.001049041748046875, "learning_rate": 9.099628156316508e-07, "loss": 0.0395, "num_tokens": 3771867.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1005.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 472.0, "completions/mean_terminated_length": 472.0, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.28502609393817746, "frac_reward_zero_std": 1.0, "grad_norm": 0.02500168789125578, "kl": 0.001331329345703125, "learning_rate": 9.097103880875529e-07, "loss": 0.0001, "num_tokens": 3776811.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 353.5, "completions/mean_terminated_length": 353.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.28529372407333065, "frac_reward_zero_std": 1.0, "grad_norm": 0.011338510382269167, "kl": 0.0009860992431640625, "learning_rate": 9.094576466379303e-07, "loss": 0.0, "num_tokens": 3780639.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 254.5, "completions/mean_terminated_length": 254.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.2855613542084839, "frac_reward_zero_std": 1.0, "grad_norm": 0.019859313595262617, "kl": 0.00142669677734375, "learning_rate": 9.09204591503341e-07, "loss": 0.0001, "num_tokens": 3783695.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.28582898434363707, "frac_reward_zero_std": 1.0, "grad_norm": 0.021131067006440966, "kl": 0.001384735107421875, "learning_rate": 9.089512229046166e-07, "loss": 0.0001, "num_tokens": 3787294.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 298.875, "completions/mean_terminated_length": 298.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.2860966144787903, "frac_reward_zero_std": 1.0, "grad_norm": 0.017921368435966336, "kl": 0.0014801025390625, "learning_rate": 9.086975410628619e-07, "loss": 0.0001, "num_tokens": 3790833.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.28636424461394355, "frac_reward_zero_std": 1.0, "grad_norm": 0.018584709568752494, "kl": 0.00096893310546875, "learning_rate": 9.084435461994555e-07, "loss": 0.0, "num_tokens": 3794443.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.28663187474909674, "frac_reward_zero_std": 1.0, "grad_norm": 0.015431424146737004, "kl": 0.0008983612060546875, "learning_rate": 9.081892385360489e-07, "loss": 0.0, "num_tokens": 3797372.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.28689950488425, "frac_reward_zero_std": 1.0, "grad_norm": 0.011696517028048753, "kl": 0.0006313323974609375, "learning_rate": 9.079346182945671e-07, "loss": 0.0, "num_tokens": 3801086.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 452.75, "completions/mean_terminated_length": 452.75, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.28716713501940316, "frac_reward_zero_std": 1.0, "grad_norm": 0.01258248865439835, "kl": 0.001255035400390625, "learning_rate": 9.076796856972071e-07, "loss": 0.0001, "num_tokens": 3805820.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 451.0, "completions/mean_terminated_length": 451.0, "completions/min_length": 390.0, "completions/min_terminated_length": 390.0, "epoch": 0.2874347651545564, "frac_reward_zero_std": 1.0, "grad_norm": 0.016706904222523352, "kl": 0.001190185546875, "learning_rate": 9.07424440966439e-07, "loss": 0.0, "num_tokens": 3810604.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 439.125, "completions/mean_terminated_length": 439.125, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.28770239528970964, "frac_reward_zero_std": 0.5, "grad_norm": 0.7380839667210434, "kl": 0.00313568115234375, "learning_rate": 9.071688843250049e-07, "loss": 0.0978, "num_tokens": 3815301.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.28797002542486283, "frac_reward_zero_std": 0.5, "grad_norm": 0.6297650901901082, "kl": 0.0008258819580078125, "learning_rate": 9.069130159959198e-07, "loss": -0.001, "num_tokens": 3819693.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 246.375, "completions/mean_terminated_length": 246.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.28823765556001607, "frac_reward_zero_std": 1.0, "grad_norm": 0.021214396042563914, "kl": 0.001361846923828125, "learning_rate": 9.066568362024696e-07, "loss": 0.0001, "num_tokens": 3822600.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.28850528569516926, "frac_reward_zero_std": 0.5, "grad_norm": 0.8305201713397328, "kl": 0.0008754730224609375, "learning_rate": 9.064003451682132e-07, "loss": -0.0712, "num_tokens": 3826658.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.2887729158303225, "frac_reward_zero_std": 1.0, "grad_norm": 0.014421991286567988, "kl": 0.000904083251953125, "learning_rate": 9.061435431169802e-07, "loss": 0.0, "num_tokens": 3830256.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.28904054596547574, "frac_reward_zero_std": 0.5, "grad_norm": 0.9832115415075717, "kl": 0.0011196136474609375, "learning_rate": 9.058864302728721e-07, "loss": 0.0201, "num_tokens": 3834455.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 340.0, "completions/mean_terminated_length": 340.0, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.2893081761006289, "frac_reward_zero_std": 1.0, "grad_norm": 0.008297757319437576, "kl": 0.0007343292236328125, "learning_rate": 9.056290068602612e-07, "loss": 0.0, "num_tokens": 3838183.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 458.75, "completions/mean_terminated_length": 458.75, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.28957580623578216, "frac_reward_zero_std": 0.5, "grad_norm": 0.6726362724935157, "kl": 0.002147674560546875, "learning_rate": 9.053712731037914e-07, "loss": -0.0604, "num_tokens": 3843021.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.28984343637093535, "frac_reward_zero_std": 0.5, "grad_norm": 0.5725554062976517, "kl": 0.0011749267578125, "learning_rate": 9.051132292283771e-07, "loss": 0.0, "num_tokens": 3846613.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.2901110665060886, "frac_reward_zero_std": 1.0, "grad_norm": 0.014890914465274864, "kl": 0.0015716552734375, "learning_rate": 9.04854875459203e-07, "loss": 0.0001, "num_tokens": 3850116.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.29037869664124183, "frac_reward_zero_std": 0.5, "grad_norm": 1.329485226836198, "kl": 0.0028533935546875, "learning_rate": 9.045962120217251e-07, "loss": 0.0638, "num_tokens": 3854153.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 317.125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.290646326776395, "frac_reward_zero_std": 0.5, "grad_norm": 0.7824470560591101, "kl": 0.0020294189453125, "learning_rate": 9.043372391416686e-07, "loss": -0.0187, "num_tokens": 3857914.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 303.0, "completions/mean_terminated_length": 303.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.29091395691154825, "frac_reward_zero_std": 1.0, "grad_norm": 0.024080727921058618, "kl": 0.002223968505859375, "learning_rate": 9.040779570450297e-07, "loss": 0.0001, "num_tokens": 3861370.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 246.875, "completions/mean_terminated_length": 246.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.29118158704670144, "frac_reward_zero_std": 1.0, "grad_norm": 0.015864490538303754, "kl": 0.00101470947265625, "learning_rate": 9.038183659580736e-07, "loss": 0.0, "num_tokens": 3864265.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 349.5, "completions/mean_terminated_length": 349.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.2914492171818547, "frac_reward_zero_std": 1.0, "grad_norm": 0.01003939120703764, "kl": 0.0006103515625, "learning_rate": 9.035584661073355e-07, "loss": 0.0, "num_tokens": 3868137.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.2917168473170079, "frac_reward_zero_std": 0.5, "grad_norm": 0.9164681351472381, "kl": 0.001953125, "learning_rate": 9.032982577196206e-07, "loss": 0.0378, "num_tokens": 3872013.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.2919844774521611, "frac_reward_zero_std": 1.0, "grad_norm": 0.008152158933919476, "kl": 0.0005064010620117188, "learning_rate": 9.030377410220023e-07, "loss": 0.0, "num_tokens": 3875727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.29225210758731435, "frac_reward_zero_std": 1.0, "grad_norm": 0.025155923704956163, "kl": 0.001644134521484375, "learning_rate": 9.027769162418239e-07, "loss": 0.0001, "num_tokens": 3879739.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 270.625, "completions/mean_terminated_length": 270.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.29251973772246753, "frac_reward_zero_std": 1.0, "grad_norm": 0.027451445096603178, "kl": 0.00185394287109375, "learning_rate": 9.025157836066969e-07, "loss": 0.0001, "num_tokens": 3882876.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 365.875, "completions/mean_terminated_length": 365.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.2927873678576208, "frac_reward_zero_std": 1.0, "grad_norm": 0.06526560369396625, "kl": 0.0030364990234375, "learning_rate": 9.022543433445022e-07, "loss": 0.0001, "num_tokens": 3886911.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.293054997992774, "frac_reward_zero_std": 1.0, "grad_norm": 0.02271796115283391, "kl": 0.001186370849609375, "learning_rate": 9.019925956833882e-07, "loss": 0.0, "num_tokens": 3890469.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.2933226281279272, "frac_reward_zero_std": 1.0, "grad_norm": 0.02091458720247283, "kl": 0.002056121826171875, "learning_rate": 9.017305408517726e-07, "loss": 0.0001, "num_tokens": 3893829.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 281.875, "completions/mean_terminated_length": 281.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.29359025826308044, "frac_reward_zero_std": 1.0, "grad_norm": 0.012164803692025133, "kl": 0.00090789794921875, "learning_rate": 9.014681790783406e-07, "loss": 0.0, "num_tokens": 3897088.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 218.125, "completions/mean_terminated_length": 218.125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.2938578883982336, "frac_reward_zero_std": 1.0, "grad_norm": 0.039327220060338154, "kl": 0.00276947021484375, "learning_rate": 9.01205510592045e-07, "loss": 0.0001, "num_tokens": 3900073.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.29412551853338686, "frac_reward_zero_std": 1.0, "grad_norm": 0.012233882984326228, "kl": 0.001026153564453125, "learning_rate": 9.009425356221068e-07, "loss": 0.0, "num_tokens": 3903602.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 182.375, "completions/mean_terminated_length": 182.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.29439314866854005, "frac_reward_zero_std": 1.0, "grad_norm": 0.017188886311939915, "kl": 0.001186370849609375, "learning_rate": 9.00679254398014e-07, "loss": 0.0, "num_tokens": 3905921.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 242.125, "completions/mean_terminated_length": 242.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.2946607788036933, "frac_reward_zero_std": 1.0, "grad_norm": 0.03705052160258421, "kl": 0.00234222412109375, "learning_rate": 9.004156671495222e-07, "loss": 0.0001, "num_tokens": 3908746.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 353.875, "completions/mean_terminated_length": 353.875, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.29492840893884653, "frac_reward_zero_std": 1.0, "grad_norm": 0.028454532697625576, "kl": 0.002056121826171875, "learning_rate": 9.001517741066543e-07, "loss": 0.0001, "num_tokens": 3912765.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.2951960390739997, "frac_reward_zero_std": 1.0, "grad_norm": 0.07834853946698485, "kl": 0.00257110595703125, "learning_rate": 8.998875754996989e-07, "loss": 0.0001, "num_tokens": 3916111.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.29546366920915296, "frac_reward_zero_std": 1.0, "grad_norm": 0.01658340795742568, "kl": 0.00103759765625, "learning_rate": 8.996230715592127e-07, "loss": 0.0, "num_tokens": 3919213.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.29573129934430614, "frac_reward_zero_std": 1.0, "grad_norm": 0.025380474185103568, "kl": 0.001552581787109375, "learning_rate": 8.993582625160179e-07, "loss": 0.0001, "num_tokens": 3922308.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.2959989294794594, "frac_reward_zero_std": 1.0, "grad_norm": 0.03391670455596781, "kl": 0.002227783203125, "learning_rate": 8.990931486012034e-07, "loss": 0.0001, "num_tokens": 3925323.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 375.25, "completions/mean_terminated_length": 375.25, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.2962665596146126, "frac_reward_zero_std": 1.0, "grad_norm": 0.009011380345357875, "kl": 0.0007266998291015625, "learning_rate": 8.988277300461238e-07, "loss": 0.0, "num_tokens": 3929237.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 196.375, "completions/mean_terminated_length": 196.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.2965341897497658, "frac_reward_zero_std": 1.0, "grad_norm": 0.040975627552290755, "kl": 0.00274658203125, "learning_rate": 8.985620070823997e-07, "loss": 0.0001, "num_tokens": 3931952.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.29680181988491905, "frac_reward_zero_std": 1.0, "grad_norm": 0.013692151726588652, "kl": 0.001007080078125, "learning_rate": 8.982959799419178e-07, "loss": 0.0, "num_tokens": 3935538.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 262.125, "completions/mean_terminated_length": 262.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.29706945002007223, "frac_reward_zero_std": 1.0, "grad_norm": 0.17616344275762857, "kl": 0.0052490234375, "learning_rate": 8.980296488568294e-07, "loss": 0.0002, "num_tokens": 3938671.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.2973370801552255, "frac_reward_zero_std": 1.0, "grad_norm": 0.014236380801994936, "kl": 0.001087188720703125, "learning_rate": 8.977630140595518e-07, "loss": 0.0, "num_tokens": 3941848.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.2976047102903787, "frac_reward_zero_std": 1.0, "grad_norm": 0.012831179869880735, "kl": 0.000942230224609375, "learning_rate": 8.974960757827666e-07, "loss": 0.0, "num_tokens": 3945364.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 365.5, "completions/mean_terminated_length": 365.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.2978723404255319, "frac_reward_zero_std": 0.5, "grad_norm": 0.7158042833689121, "kl": 0.0008792877197265625, "learning_rate": 8.97228834259421e-07, "loss": 0.1146, "num_tokens": 3949336.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 241.875, "completions/mean_terminated_length": 241.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.29813997056068514, "frac_reward_zero_std": 1.0, "grad_norm": 0.013841788652525257, "kl": 0.000888824462890625, "learning_rate": 8.969612897227264e-07, "loss": 0.0, "num_tokens": 3952343.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.2984076006958383, "frac_reward_zero_std": 1.0, "grad_norm": 0.038134362978282293, "kl": 0.0019378662109375, "learning_rate": 8.966934424061585e-07, "loss": 0.0001, "num_tokens": 3956279.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 479.375, "completions/mean_terminated_length": 479.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.29867523083099157, "frac_reward_zero_std": 1.0, "grad_norm": 0.011589527668654287, "kl": 0.0007648468017578125, "learning_rate": 8.964252925434578e-07, "loss": 0.0, "num_tokens": 3961118.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 368.125, "completions/mean_terminated_length": 368.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.2989428609661448, "frac_reward_zero_std": 1.0, "grad_norm": 0.025134939944901326, "kl": 0.00167083740234375, "learning_rate": 8.961568403686281e-07, "loss": 0.0001, "num_tokens": 3965215.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 302.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.299210491101298, "frac_reward_zero_std": 1.0, "grad_norm": 0.034061896551913246, "kl": 0.00164794921875, "learning_rate": 8.958880861159376e-07, "loss": 0.0001, "num_tokens": 3968652.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.29947812123645123, "frac_reward_zero_std": 1.0, "grad_norm": 0.020740229228510668, "kl": 0.00133514404296875, "learning_rate": 8.956190300199178e-07, "loss": 0.0001, "num_tokens": 3971592.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.2997457513716044, "frac_reward_zero_std": 1.0, "grad_norm": 0.016666746264592292, "kl": 0.001220703125, "learning_rate": 8.953496723153635e-07, "loss": 0.0, "num_tokens": 3974350.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 426.25, "completions/mean_terminated_length": 426.25, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.30001338150675766, "frac_reward_zero_std": 0.5, "grad_norm": 0.6929741185359846, "kl": 0.001728057861328125, "learning_rate": 8.950800132373331e-07, "loss": 0.0402, "num_tokens": 3978680.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3002810116419109, "frac_reward_zero_std": 0.5, "grad_norm": 0.7366120323853431, "kl": 0.001117706298828125, "learning_rate": 8.948100530211479e-07, "loss": 0.0, "num_tokens": 3982094.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3005486417770641, "frac_reward_zero_std": 1.0, "grad_norm": 0.01685535816561334, "kl": 0.001003265380859375, "learning_rate": 8.945397919023916e-07, "loss": 0.0, "num_tokens": 3985729.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3008162719122173, "frac_reward_zero_std": 1.0, "grad_norm": 0.01962198649648495, "kl": 0.00115203857421875, "learning_rate": 8.942692301169109e-07, "loss": 0.0, "num_tokens": 3989457.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 366.625, "completions/mean_terminated_length": 366.625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3010839020473705, "frac_reward_zero_std": 1.0, "grad_norm": 0.007880353966983526, "kl": 0.000926971435546875, "learning_rate": 8.939983679008146e-07, "loss": 0.0, "num_tokens": 3993446.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.30135153218252375, "frac_reward_zero_std": 1.0, "grad_norm": 0.013773925178906148, "kl": 0.001140594482421875, "learning_rate": 8.937272054904741e-07, "loss": 0.0, "num_tokens": 3996991.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 250.875, "completions/mean_terminated_length": 250.875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.301619162317677, "frac_reward_zero_std": 1.0, "grad_norm": 0.020137353835073295, "kl": 0.001148223876953125, "learning_rate": 8.934557431225223e-07, "loss": 0.0, "num_tokens": 4000210.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3018867924528302, "frac_reward_zero_std": 1.0, "grad_norm": 0.022107942283901283, "kl": 0.00167083740234375, "learning_rate": 8.93183981033854e-07, "loss": 0.0001, "num_tokens": 4003738.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3021544225879834, "frac_reward_zero_std": 1.0, "grad_norm": 0.033454999795062765, "kl": 0.0015201568603515625, "learning_rate": 8.929119194616256e-07, "loss": 0.0001, "num_tokens": 4006848.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3024220527231366, "frac_reward_zero_std": 1.0, "grad_norm": 0.015459499450132102, "kl": 0.00191497802734375, "learning_rate": 8.926395586432551e-07, "loss": 0.0001, "num_tokens": 4010401.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.30268968285828984, "frac_reward_zero_std": 1.0, "grad_norm": 0.012633128797017137, "kl": 0.0009021759033203125, "learning_rate": 8.923668988164211e-07, "loss": 0.0, "num_tokens": 4013330.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.3029573129934431, "frac_reward_zero_std": 1.0, "grad_norm": 0.014412976177793524, "kl": 0.00124359130859375, "learning_rate": 8.920939402190635e-07, "loss": 0.0, "num_tokens": 4017386.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.30322494312859627, "frac_reward_zero_std": 1.0, "grad_norm": 0.015620096276887025, "kl": 0.001514434814453125, "learning_rate": 8.91820683089383e-07, "loss": 0.0001, "num_tokens": 4021269.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 315.375, "completions/mean_terminated_length": 315.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3034925732637495, "frac_reward_zero_std": 1.0, "grad_norm": 0.009803919446507648, "kl": 0.000682830810546875, "learning_rate": 8.915471276658404e-07, "loss": 0.0, "num_tokens": 4024864.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.3037602033989027, "frac_reward_zero_std": 0.5, "grad_norm": 1.4376940443410755, "kl": 0.0041656494140625, "learning_rate": 8.912732741871572e-07, "loss": -0.0195, "num_tokens": 4027908.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.30402783353405594, "frac_reward_zero_std": 1.0, "grad_norm": 0.013657043535490533, "kl": 0.00103759765625, "learning_rate": 8.90999122892315e-07, "loss": 0.0, "num_tokens": 4031289.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.3042954636692092, "frac_reward_zero_std": 0.5, "grad_norm": 1.0179487261874471, "kl": 0.001033782958984375, "learning_rate": 8.907246740205551e-07, "loss": -0.047, "num_tokens": 4034520.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.30456309380436236, "frac_reward_zero_std": 1.0, "grad_norm": 0.017790058752515242, "kl": 0.000926971435546875, "learning_rate": 8.904499278113784e-07, "loss": 0.0, "num_tokens": 4037792.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3048307239395156, "frac_reward_zero_std": 1.0, "grad_norm": 0.047391033920582776, "kl": 0.003875732421875, "learning_rate": 8.901748845045456e-07, "loss": 0.0002, "num_tokens": 4041245.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 360.125, "completions/mean_terminated_length": 360.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3050983540746688, "frac_reward_zero_std": 1.0, "grad_norm": 0.013154714661054287, "kl": 0.000957489013671875, "learning_rate": 8.898995443400765e-07, "loss": 0.0, "num_tokens": 4045278.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 426.375, "completions/mean_terminated_length": 426.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.30536598420982203, "frac_reward_zero_std": 1.0, "grad_norm": 0.010993987622307336, "kl": 0.00091552734375, "learning_rate": 8.896239075582498e-07, "loss": 0.0, "num_tokens": 4049917.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 283.875, "completions/mean_terminated_length": 283.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.30563361434497527, "frac_reward_zero_std": 1.0, "grad_norm": 0.023645876355125228, "kl": 0.001232147216796875, "learning_rate": 8.893479743996033e-07, "loss": 0.0, "num_tokens": 4053216.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.30590124448012845, "frac_reward_zero_std": 1.0, "grad_norm": 0.07173449453186728, "kl": 0.00232696533203125, "learning_rate": 8.890717451049333e-07, "loss": 0.0001, "num_tokens": 4056176.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.3061688746152817, "frac_reward_zero_std": 1.0, "grad_norm": 0.011209634048009452, "kl": 0.0010013580322265625, "learning_rate": 8.887952199152946e-07, "loss": 0.0, "num_tokens": 4059972.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 189.625, "completions/mean_terminated_length": 189.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3064365047504349, "frac_reward_zero_std": 1.0, "grad_norm": 0.010934139245912971, "kl": 0.0005588531494140625, "learning_rate": 8.885183990720003e-07, "loss": 0.0, "num_tokens": 4062329.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.3067041348855881, "frac_reward_zero_std": 1.0, "grad_norm": 0.013270279799500097, "kl": 0.001026153564453125, "learning_rate": 8.882412828166211e-07, "loss": 0.0, "num_tokens": 4066140.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.30697176502074136, "frac_reward_zero_std": 1.0, "grad_norm": 0.014368006025703728, "kl": 0.001194000244140625, "learning_rate": 8.87963871390986e-07, "loss": 0.0, "num_tokens": 4069433.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.30723939515589455, "frac_reward_zero_std": 1.0, "grad_norm": 0.061086036787993646, "kl": 0.0052642822265625, "learning_rate": 8.876861650371812e-07, "loss": 0.0002, "num_tokens": 4072964.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3075070252910478, "frac_reward_zero_std": 0.5, "grad_norm": 0.7208201125071694, "kl": 0.00157928466796875, "learning_rate": 8.874081639975507e-07, "loss": 0.0001, "num_tokens": 4076045.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.30777465542620097, "frac_reward_zero_std": 1.0, "grad_norm": 0.04382539829142723, "kl": 0.00308990478515625, "learning_rate": 8.871298685146951e-07, "loss": 0.0001, "num_tokens": 4079500.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.3080422855613542, "frac_reward_zero_std": 0.0, "grad_norm": 1.5817027824852574, "kl": 0.001918792724609375, "learning_rate": 8.868512788314722e-07, "loss": -0.0263, "num_tokens": 4083139.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.30830991569650745, "frac_reward_zero_std": 0.5, "grad_norm": 1.0245470236682857, "kl": 0.001796722412109375, "learning_rate": 8.865723951909971e-07, "loss": 0.0273, "num_tokens": 4086053.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.30857754583166064, "frac_reward_zero_std": 0.5, "grad_norm": 0.8748928648891222, "kl": 0.00104522705078125, "learning_rate": 8.862932178366401e-07, "loss": -0.0088, "num_tokens": 4090257.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 316.625, "completions/mean_terminated_length": 316.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3088451759668139, "frac_reward_zero_std": 1.0, "grad_norm": 0.029255827555815465, "kl": 0.002140045166015625, "learning_rate": 8.860137470120294e-07, "loss": 0.0001, "num_tokens": 4093882.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.30911280610196706, "frac_reward_zero_std": 1.0, "grad_norm": 0.02523022122509671, "kl": 0.001110076904296875, "learning_rate": 8.857339829610482e-07, "loss": 0.0, "num_tokens": 4097536.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3093804362371203, "frac_reward_zero_std": 1.0, "grad_norm": 0.011866992620266548, "kl": 0.0006761550903320312, "learning_rate": 8.854539259278357e-07, "loss": 0.0, "num_tokens": 4100314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 295.375, "completions/mean_terminated_length": 295.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.30964806637227354, "frac_reward_zero_std": 0.5, "grad_norm": 1.0887082155505576, "kl": 0.00264739990234375, "learning_rate": 8.851735761567874e-07, "loss": 0.0277, "num_tokens": 4103937.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 351.5, "completions/mean_terminated_length": 351.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.30991569650742673, "frac_reward_zero_std": 1.0, "grad_norm": 0.012362757740480417, "kl": 0.00109100341796875, "learning_rate": 8.848929338925535e-07, "loss": 0.0, "num_tokens": 4107869.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.31018332664257997, "frac_reward_zero_std": 1.0, "grad_norm": 0.019359485507925815, "kl": 0.0008697509765625, "learning_rate": 8.846119993800399e-07, "loss": 0.0, "num_tokens": 4111333.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.31045095677773316, "frac_reward_zero_std": 1.0, "grad_norm": 0.014265390423597552, "kl": 0.00130462646484375, "learning_rate": 8.843307728644075e-07, "loss": 0.0001, "num_tokens": 4115224.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3107185869128864, "frac_reward_zero_std": 1.0, "grad_norm": 0.03724878883367649, "kl": 0.002349853515625, "learning_rate": 8.840492545910718e-07, "loss": 0.0001, "num_tokens": 4118645.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 285.875, "completions/mean_terminated_length": 285.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.3109862170480396, "frac_reward_zero_std": 1.0, "grad_norm": 0.021304253896188306, "kl": 0.001220703125, "learning_rate": 8.837674448057032e-07, "loss": 0.0, "num_tokens": 4121936.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3112538471831928, "frac_reward_zero_std": 1.0, "grad_norm": 0.017700680273865503, "kl": 0.001308441162109375, "learning_rate": 8.834853437542264e-07, "loss": 0.0001, "num_tokens": 4125838.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.31152147731834606, "frac_reward_zero_std": 1.0, "grad_norm": 0.015001102273263584, "kl": 0.0013065338134765625, "learning_rate": 8.832029516828199e-07, "loss": 0.0001, "num_tokens": 4129076.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.31178910745349925, "frac_reward_zero_std": 1.0, "grad_norm": 0.022763720599622235, "kl": 0.0016632080078125, "learning_rate": 8.829202688379167e-07, "loss": 0.0001, "num_tokens": 4132637.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 242.5, "completions/mean_terminated_length": 242.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.3120567375886525, "frac_reward_zero_std": 1.0, "grad_norm": 0.01809390462471421, "kl": 0.0016632080078125, "learning_rate": 8.826372954662033e-07, "loss": 0.0001, "num_tokens": 4135509.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 214.0, "completions/mean_terminated_length": 214.0, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3123243677238057, "frac_reward_zero_std": 1.0, "grad_norm": 0.012177355840345554, "kl": 0.0008678436279296875, "learning_rate": 8.8235403181462e-07, "loss": 0.0, "num_tokens": 4138197.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 188.75, "completions/mean_terminated_length": 188.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.3125919978589589, "frac_reward_zero_std": 1.0, "grad_norm": 0.018316460400110368, "kl": 0.00138092041015625, "learning_rate": 8.820704781303596e-07, "loss": 0.0001, "num_tokens": 4140667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.31285962799411215, "frac_reward_zero_std": 1.0, "grad_norm": 0.029318151030526603, "kl": 0.00334930419921875, "learning_rate": 8.817866346608692e-07, "loss": 0.0001, "num_tokens": 4143704.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.31312725812926534, "frac_reward_zero_std": 1.0, "grad_norm": 0.017205267562162972, "kl": 0.00101470947265625, "learning_rate": 8.815025016538476e-07, "loss": 0.0, "num_tokens": 4147187.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3133948882644186, "frac_reward_zero_std": 1.0, "grad_norm": 0.012102332309575168, "kl": 0.0006885528564453125, "learning_rate": 8.812180793572473e-07, "loss": 0.0, "num_tokens": 4150464.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.31366251839957177, "frac_reward_zero_std": 1.0, "grad_norm": 0.0182929504331773, "kl": 0.001216888427734375, "learning_rate": 8.809333680192727e-07, "loss": 0.0, "num_tokens": 4153858.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.313930148534725, "frac_reward_zero_std": 1.0, "grad_norm": 0.007043361773525957, "kl": 0.0004749298095703125, "learning_rate": 8.806483678883801e-07, "loss": 0.0, "num_tokens": 4157363.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.31419777866987825, "frac_reward_zero_std": 1.0, "grad_norm": 0.0180031987840543, "kl": 0.00086212158203125, "learning_rate": 8.80363079213279e-07, "loss": 0.0, "num_tokens": 4160590.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.31446540880503143, "frac_reward_zero_std": 1.0, "grad_norm": 0.021041770873791388, "kl": 0.001949310302734375, "learning_rate": 8.800775022429291e-07, "loss": 0.0001, "num_tokens": 4163221.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3147330389401847, "frac_reward_zero_std": 1.0, "grad_norm": 0.027379615180185078, "kl": 0.00159454345703125, "learning_rate": 8.79791637226543e-07, "loss": 0.0001, "num_tokens": 4166199.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 209.625, "completions/mean_terminated_length": 209.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.31500066907533786, "frac_reward_zero_std": 1.0, "grad_norm": 0.00878545182148966, "kl": 0.0010986328125, "learning_rate": 8.79505484413584e-07, "loss": 0.0, "num_tokens": 4168804.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3152682992104911, "frac_reward_zero_std": 1.0, "grad_norm": 0.03009783474185433, "kl": 0.0016117095947265625, "learning_rate": 8.792190440537669e-07, "loss": 0.0001, "num_tokens": 4172259.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.31553592934564434, "frac_reward_zero_std": 1.0, "grad_norm": 0.03078653529277556, "kl": 0.00199127197265625, "learning_rate": 8.789323163970573e-07, "loss": 0.0001, "num_tokens": 4175580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.3158035594807975, "frac_reward_zero_std": 1.0, "grad_norm": 0.0608226799162096, "kl": 0.0029754638671875, "learning_rate": 8.786453016936711e-07, "loss": 0.0001, "num_tokens": 4179157.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.31607118961595077, "frac_reward_zero_std": 0.5, "grad_norm": 0.6908532522592502, "kl": 0.00153350830078125, "learning_rate": 8.783580001940756e-07, "loss": -0.0014, "num_tokens": 4182850.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.31633881975110395, "frac_reward_zero_std": 1.0, "grad_norm": 0.015416765951737285, "kl": 0.00154876708984375, "learning_rate": 8.780704121489875e-07, "loss": 0.0001, "num_tokens": 4185771.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3166064498862572, "frac_reward_zero_std": 1.0, "grad_norm": 0.13103414543396644, "kl": 0.0037078857421875, "learning_rate": 8.77782537809374e-07, "loss": 0.0001, "num_tokens": 4189043.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.31687408002141043, "frac_reward_zero_std": 1.0, "grad_norm": 0.023543834603288245, "kl": 0.001468658447265625, "learning_rate": 8.774943774264521e-07, "loss": 0.0001, "num_tokens": 4192209.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3171417101565636, "frac_reward_zero_std": 0.0, "grad_norm": 1.346096163853665, "kl": 0.00186920166015625, "learning_rate": 8.772059312516883e-07, "loss": -0.02, "num_tokens": 4196064.0, "reward": 0.5, "reward_std": 0.5773502588272095, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.31740934029171686, "frac_reward_zero_std": 1.0, "grad_norm": 0.030147505564015114, "kl": 0.0014190673828125, "learning_rate": 8.769171995367987e-07, "loss": 0.0001, "num_tokens": 4199011.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.31767697042687004, "frac_reward_zero_std": 0.5, "grad_norm": 0.8757618222808912, "kl": 0.001018524169921875, "learning_rate": 8.76628182533748e-07, "loss": 0.0128, "num_tokens": 4202846.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 438.625, "completions/mean_terminated_length": 438.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3179446005620233, "frac_reward_zero_std": 1.0, "grad_norm": 0.014242779894923908, "kl": 0.001617431640625, "learning_rate": 8.763388804947509e-07, "loss": 0.0001, "num_tokens": 4207419.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.3182122306971765, "frac_reward_zero_std": 0.5, "grad_norm": 0.8653225895796511, "kl": 0.0050506591796875, "learning_rate": 8.7604929367227e-07, "loss": -0.0194, "num_tokens": 4211138.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 337.625, "completions/mean_terminated_length": 337.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3184798608323297, "frac_reward_zero_std": 0.5, "grad_norm": 0.5492391052592854, "kl": 0.0015411376953125, "learning_rate": 8.757594223190167e-07, "loss": 0.0405, "num_tokens": 4214987.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 279.75, "completions/mean_terminated_length": 279.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.31874749096748295, "frac_reward_zero_std": 1.0, "grad_norm": 0.011123133025116562, "kl": 0.0007724761962890625, "learning_rate": 8.754692666879504e-07, "loss": 0.0, "num_tokens": 4218437.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.31901512110263613, "frac_reward_zero_std": 1.0, "grad_norm": 0.014077147009498152, "kl": 0.0010223388671875, "learning_rate": 8.751788270322792e-07, "loss": 0.0, "num_tokens": 4221948.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 370.25, "completions/mean_terminated_length": 370.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3192827512377894, "frac_reward_zero_std": 1.0, "grad_norm": 0.011666821666120363, "kl": 0.00096893310546875, "learning_rate": 8.748881036054585e-07, "loss": 0.0, "num_tokens": 4226078.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3195503813729426, "frac_reward_zero_std": 1.0, "grad_norm": 0.01107166910906599, "kl": 0.00090789794921875, "learning_rate": 8.745970966611916e-07, "loss": 0.0, "num_tokens": 4229324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 367.375, "completions/mean_terminated_length": 367.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.3198180115080958, "frac_reward_zero_std": 0.5, "grad_norm": 0.8667496726703746, "kl": 0.002323150634765625, "learning_rate": 8.743058064534294e-07, "loss": 0.0001, "num_tokens": 4233427.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 358.0, "completions/mean_terminated_length": 358.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.32008564164324904, "frac_reward_zero_std": 1.0, "grad_norm": 0.02152706905655654, "kl": 0.001399993896484375, "learning_rate": 8.740142332363692e-07, "loss": 0.0001, "num_tokens": 4237523.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 436.75, "completions/mean_terminated_length": 436.75, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.3203532717784022, "frac_reward_zero_std": 1.0, "grad_norm": 0.009755435109294625, "kl": 0.00106048583984375, "learning_rate": 8.73722377264456e-07, "loss": 0.0, "num_tokens": 4242393.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.32062090191355547, "frac_reward_zero_std": 1.0, "grad_norm": 0.020357377722567724, "kl": 0.0022430419921875, "learning_rate": 8.734302387923817e-07, "loss": 0.0001, "num_tokens": 4245864.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 233.5, "completions/mean_terminated_length": 233.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3208885320487087, "frac_reward_zero_std": 1.0, "grad_norm": 0.023056886569502277, "kl": 0.0012359619140625, "learning_rate": 8.731378180750842e-07, "loss": 0.0, "num_tokens": 4248668.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3211561621838619, "frac_reward_zero_std": 1.0, "grad_norm": 0.01260108351325428, "kl": 0.00121307373046875, "learning_rate": 8.728451153677478e-07, "loss": 0.0, "num_tokens": 4251732.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.32142379231901513, "frac_reward_zero_std": 0.5, "grad_norm": 0.702171252474359, "kl": 0.00182342529296875, "learning_rate": 8.72552130925803e-07, "loss": 0.0039, "num_tokens": 4255371.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.3216914224541683, "frac_reward_zero_std": 1.0, "grad_norm": 0.016728167718293233, "kl": 0.001445770263671875, "learning_rate": 8.722588650049263e-07, "loss": 0.0001, "num_tokens": 4259573.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.32195905258932156, "frac_reward_zero_std": 1.0, "grad_norm": 0.018520001811128162, "kl": 0.001224517822265625, "learning_rate": 8.719653178610394e-07, "loss": 0.0, "num_tokens": 4263167.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 317.75, "completions/mean_terminated_length": 317.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.3222266827244748, "frac_reward_zero_std": 1.0, "grad_norm": 0.01616854376918311, "kl": 0.001140594482421875, "learning_rate": 8.7167148975031e-07, "loss": 0.0, "num_tokens": 4266701.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 299.375, "completions/mean_terminated_length": 299.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.322494312859628, "frac_reward_zero_std": 1.0, "grad_norm": 0.038225816106433934, "kl": 0.00165557861328125, "learning_rate": 8.713773809291505e-07, "loss": 0.0001, "num_tokens": 4270128.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3227619429947812, "frac_reward_zero_std": 1.0, "grad_norm": 0.012057353885055766, "kl": 0.0014133453369140625, "learning_rate": 8.710829916542184e-07, "loss": 0.0001, "num_tokens": 4273998.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 323.25, "completions/mean_terminated_length": 323.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3230295731299344, "frac_reward_zero_std": 1.0, "grad_norm": 0.014807509098132837, "kl": 0.00141143798828125, "learning_rate": 8.70788322182416e-07, "loss": 0.0001, "num_tokens": 4277836.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 311.625, "completions/mean_terminated_length": 311.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.32329720326508765, "frac_reward_zero_std": 1.0, "grad_norm": 0.04755756598943994, "kl": 0.0025177001953125, "learning_rate": 8.704933727708901e-07, "loss": 0.0001, "num_tokens": 4281525.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3235648334002409, "frac_reward_zero_std": 0.5, "grad_norm": 1.18847104365541, "kl": 0.003856658935546875, "learning_rate": 8.70198143677032e-07, "loss": -0.0312, "num_tokens": 4284892.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 331.75, "completions/mean_terminated_length": 331.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3238324635353941, "frac_reward_zero_std": 1.0, "grad_norm": 0.07488556034595835, "kl": 0.0025787353515625, "learning_rate": 8.699026351584769e-07, "loss": 0.0001, "num_tokens": 4288526.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3241000936705473, "frac_reward_zero_std": 1.0, "grad_norm": 0.013524053468999469, "kl": 0.0006389617919921875, "learning_rate": 8.696068474731031e-07, "loss": 0.0, "num_tokens": 4291941.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3243677238057005, "frac_reward_zero_std": 1.0, "grad_norm": 0.02113707507920205, "kl": 0.0016040802001953125, "learning_rate": 8.69310780879034e-07, "loss": 0.0001, "num_tokens": 4294908.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 274.875, "completions/mean_terminated_length": 274.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.32463535394085374, "frac_reward_zero_std": 1.0, "grad_norm": 0.03036430479383965, "kl": 0.002101898193359375, "learning_rate": 8.690144356346354e-07, "loss": 0.0001, "num_tokens": 4298095.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.324902984076007, "frac_reward_zero_std": 1.0, "grad_norm": 0.01066980822037888, "kl": 0.000759124755859375, "learning_rate": 8.687178119985164e-07, "loss": 0.0, "num_tokens": 4301698.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 266.0, "completions/mean_terminated_length": 266.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.32517061421116017, "frac_reward_zero_std": 1.0, "grad_norm": 0.00985415949124388, "kl": 0.0006513595581054688, "learning_rate": 8.68420910229529e-07, "loss": 0.0, "num_tokens": 4304782.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 355.375, "completions/mean_terminated_length": 355.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3254382443463134, "frac_reward_zero_std": 1.0, "grad_norm": 0.011678776174526269, "kl": 0.0009174346923828125, "learning_rate": 8.681237305867682e-07, "loss": 0.0, "num_tokens": 4308665.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 341.375, "completions/mean_terminated_length": 341.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3257058744814666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01650008426870687, "kl": 0.001861572265625, "learning_rate": 8.678262733295715e-07, "loss": 0.0001, "num_tokens": 4312544.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 403.0, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.32597350461661984, "frac_reward_zero_std": 0.5, "grad_norm": 0.60512346869414, "kl": 0.003688812255859375, "learning_rate": 8.675285387175181e-07, "loss": -0.0102, "num_tokens": 4316684.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 187.625, "completions/mean_terminated_length": 187.625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.3262411347517731, "frac_reward_zero_std": 1.0, "grad_norm": 0.015759838888083622, "kl": 0.00106048583984375, "learning_rate": 8.672305270104301e-07, "loss": 0.0, "num_tokens": 4319101.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 263.25, "completions/mean_terminated_length": 263.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.32650876488692626, "frac_reward_zero_std": 0.5, "grad_norm": 0.9601718993502839, "kl": 0.00235748291015625, "learning_rate": 8.669322384683705e-07, "loss": -0.0133, "num_tokens": 4322255.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 357.875, "completions/mean_terminated_length": 357.875, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.3267763950220795, "frac_reward_zero_std": 0.5, "grad_norm": 0.8396060743109907, "kl": 0.001495361328125, "learning_rate": 8.666336733516445e-07, "loss": 0.0001, "num_tokens": 4326222.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.3270440251572327, "frac_reward_zero_std": 1.0, "grad_norm": 0.011888709529615479, "kl": 0.0008563995361328125, "learning_rate": 8.663348319207985e-07, "loss": 0.0, "num_tokens": 4329569.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.32731165529238593, "frac_reward_zero_std": 1.0, "grad_norm": 0.03522025389649976, "kl": 0.001880645751953125, "learning_rate": 8.660357144366201e-07, "loss": 0.0001, "num_tokens": 4333342.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.32757928542753917, "frac_reward_zero_std": 1.0, "grad_norm": 0.021402675951182417, "kl": 0.0014801025390625, "learning_rate": 8.657363211601374e-07, "loss": 0.0001, "num_tokens": 4336690.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.32784691556269235, "frac_reward_zero_std": 1.0, "grad_norm": 0.012489564963891121, "kl": 0.00106048583984375, "learning_rate": 8.654366523526199e-07, "loss": 0.0, "num_tokens": 4340019.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.3281145456978456, "frac_reward_zero_std": 1.0, "grad_norm": 0.027578093170497364, "kl": 0.0018310546875, "learning_rate": 8.651367082755767e-07, "loss": 0.0001, "num_tokens": 4342884.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.3283821758329988, "frac_reward_zero_std": 1.0, "grad_norm": 0.016761878388755848, "kl": 0.00119781494140625, "learning_rate": 8.648364891907579e-07, "loss": 0.0, "num_tokens": 4346152.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.328649805968152, "frac_reward_zero_std": 1.0, "grad_norm": 0.05457416671443051, "kl": 0.0017242431640625, "learning_rate": 8.645359953601532e-07, "loss": 0.0001, "num_tokens": 4349642.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3289174361033052, "frac_reward_zero_std": 1.0, "grad_norm": 0.02545213790210443, "kl": 0.001895904541015625, "learning_rate": 8.642352270459918e-07, "loss": 0.0001, "num_tokens": 4353012.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 212.5, "completions/mean_terminated_length": 212.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.32918506623845845, "frac_reward_zero_std": 1.0, "grad_norm": 0.019939459139921265, "kl": 0.00162506103515625, "learning_rate": 8.639341845107432e-07, "loss": 0.0001, "num_tokens": 4355620.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3294526963736117, "frac_reward_zero_std": 1.0, "grad_norm": 0.015903836009613117, "kl": 0.0012969970703125, "learning_rate": 8.636328680171152e-07, "loss": 0.0001, "num_tokens": 4358851.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 355.875, "completions/mean_terminated_length": 355.875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.32972032650876487, "frac_reward_zero_std": 1.0, "grad_norm": 0.061330514915266664, "kl": 0.00318145751953125, "learning_rate": 8.633312778280555e-07, "loss": 0.0001, "num_tokens": 4362946.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3299879566439181, "frac_reward_zero_std": 1.0, "grad_norm": 0.017650402608603347, "kl": 0.001720428466796875, "learning_rate": 8.630294142067504e-07, "loss": 0.0001, "num_tokens": 4366777.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3302555867790713, "frac_reward_zero_std": 1.0, "grad_norm": 0.014110829260302297, "kl": 0.001155853271484375, "learning_rate": 8.627272774166245e-07, "loss": 0.0, "num_tokens": 4370173.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 222.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.33052321691422454, "frac_reward_zero_std": 1.0, "grad_norm": 0.011996660764697005, "kl": 0.0008392333984375, "learning_rate": 8.624248677213413e-07, "loss": 0.0, "num_tokens": 4373041.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 231.5, "completions/mean_terminated_length": 231.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.3307908470493778, "frac_reward_zero_std": 1.0, "grad_norm": 0.02266266609826258, "kl": 0.001361846923828125, "learning_rate": 8.621221853848021e-07, "loss": 0.0001, "num_tokens": 4375917.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.33105847718453096, "frac_reward_zero_std": 1.0, "grad_norm": 0.03934308661534641, "kl": 0.0013370513916015625, "learning_rate": 8.618192306711462e-07, "loss": 0.0001, "num_tokens": 4378794.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 313.625, "completions/mean_terminated_length": 313.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.3313261073196842, "frac_reward_zero_std": 1.0, "grad_norm": 0.1270798447999048, "kl": 0.002269744873046875, "learning_rate": 8.615160038447507e-07, "loss": 0.0001, "num_tokens": 4382291.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.3315937374548374, "frac_reward_zero_std": 1.0, "grad_norm": 0.030502585123681, "kl": 0.00252532958984375, "learning_rate": 8.612125051702299e-07, "loss": 0.0001, "num_tokens": 4386134.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.33186136758999063, "frac_reward_zero_std": 1.0, "grad_norm": 0.013942851963972195, "kl": 0.000965118408203125, "learning_rate": 8.609087349124358e-07, "loss": 0.0, "num_tokens": 4389606.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 295.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.33212899772514387, "frac_reward_zero_std": 1.0, "grad_norm": 0.045373768466812354, "kl": 0.003673553466796875, "learning_rate": 8.606046933364567e-07, "loss": 0.0001, "num_tokens": 4393018.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 307.125, "completions/mean_terminated_length": 307.125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.33239662786029706, "frac_reward_zero_std": 1.0, "grad_norm": 0.03591890815453233, "kl": 0.0023651123046875, "learning_rate": 8.603003807076184e-07, "loss": 0.0001, "num_tokens": 4396575.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 239.125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.3326642579954503, "frac_reward_zero_std": 1.0, "grad_norm": 0.10024298252079107, "kl": 0.0048675537109375, "learning_rate": 8.599957972914826e-07, "loss": 0.0002, "num_tokens": 4399628.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.3329318881306035, "frac_reward_zero_std": 1.0, "grad_norm": 0.013253360743846273, "kl": 0.00109100341796875, "learning_rate": 8.596909433538482e-07, "loss": 0.0, "num_tokens": 4402589.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.3331995182657567, "frac_reward_zero_std": 1.0, "grad_norm": 0.017420086976598206, "kl": 0.0009326934814453125, "learning_rate": 8.593858191607491e-07, "loss": 0.0, "num_tokens": 4405185.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.33346714840090996, "frac_reward_zero_std": 1.0, "grad_norm": 0.07848670275811201, "kl": 0.00322723388671875, "learning_rate": 8.590804249784557e-07, "loss": 0.0001, "num_tokens": 4408392.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 434.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.33373477853606315, "frac_reward_zero_std": 1.0, "grad_norm": 0.027677855193656925, "kl": 0.001743316650390625, "learning_rate": 8.587747610734738e-07, "loss": 0.0001, "num_tokens": 4413028.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 237.125, "completions/mean_terminated_length": 237.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3340024086712164, "frac_reward_zero_std": 1.0, "grad_norm": 0.015365297796990753, "kl": 0.000957489013671875, "learning_rate": 8.584688277125445e-07, "loss": 0.0, "num_tokens": 4415861.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 344.625, "completions/mean_terminated_length": 344.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3342700388063696, "frac_reward_zero_std": 1.0, "grad_norm": 0.013048940367134304, "kl": 0.000988006591796875, "learning_rate": 8.581626251626445e-07, "loss": 0.0, "num_tokens": 4419742.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 402.375, "completions/mean_terminated_length": 402.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.3345376689415228, "frac_reward_zero_std": 1.0, "grad_norm": 0.014453231901186556, "kl": 0.00164031982421875, "learning_rate": 8.578561536909848e-07, "loss": 0.0001, "num_tokens": 4424297.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.33480529907667606, "frac_reward_zero_std": 0.5, "grad_norm": 0.590453500383657, "kl": 0.0020599365234375, "learning_rate": 8.575494135650115e-07, "loss": 0.023, "num_tokens": 4427651.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 249.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.33507292921182924, "frac_reward_zero_std": 1.0, "grad_norm": 0.011666854618186381, "kl": 0.00101470947265625, "learning_rate": 8.572424050524048e-07, "loss": 0.0, "num_tokens": 4430571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.3353405593469825, "frac_reward_zero_std": 1.0, "grad_norm": 0.012471643831197327, "kl": 0.000659942626953125, "learning_rate": 8.569351284210795e-07, "loss": 0.0, "num_tokens": 4433734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.33560818948213567, "frac_reward_zero_std": 1.0, "grad_norm": 0.016447651415972552, "kl": 0.001678466796875, "learning_rate": 8.566275839391841e-07, "loss": 0.0001, "num_tokens": 4437510.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3358758196172889, "frac_reward_zero_std": 0.5, "grad_norm": 0.8332105480000377, "kl": 0.0008792877197265625, "learning_rate": 8.563197718751011e-07, "loss": 0.0205, "num_tokens": 4440720.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.33614344975244215, "frac_reward_zero_std": 1.0, "grad_norm": 0.02292719246130776, "kl": 0.001659393310546875, "learning_rate": 8.560116924974461e-07, "loss": 0.0001, "num_tokens": 4444080.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.33641107988759533, "frac_reward_zero_std": 1.0, "grad_norm": 0.041351437954237416, "kl": 0.0027313232421875, "learning_rate": 8.557033460750684e-07, "loss": 0.0001, "num_tokens": 4447674.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 445.0, "completions/mean_terminated_length": 445.0, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.3366787100227486, "frac_reward_zero_std": 0.5, "grad_norm": 0.38965256579138197, "kl": 0.0012569427490234375, "learning_rate": 8.553947328770498e-07, "loss": 0.083, "num_tokens": 4452294.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.33694634015790176, "frac_reward_zero_std": 1.0, "grad_norm": 0.012667799502385602, "kl": 0.000797271728515625, "learning_rate": 8.550858531727058e-07, "loss": 0.0, "num_tokens": 4455697.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 339.25, "completions/mean_terminated_length": 339.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.337213970293055, "frac_reward_zero_std": 1.0, "grad_norm": 0.04141114242763092, "kl": 0.002979278564453125, "learning_rate": 8.547767072315834e-07, "loss": 0.0001, "num_tokens": 4459399.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.33748160042820824, "frac_reward_zero_std": 1.0, "grad_norm": 0.011463568756494463, "kl": 0.000606536865234375, "learning_rate": 8.544672953234628e-07, "loss": 0.0, "num_tokens": 4462874.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 236.0, "completions/mean_terminated_length": 236.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3377492305633614, "frac_reward_zero_std": 1.0, "grad_norm": 0.01439191959105911, "kl": 0.001338958740234375, "learning_rate": 8.54157617718356e-07, "loss": 0.0001, "num_tokens": 4465750.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.33801686069851467, "frac_reward_zero_std": 1.0, "grad_norm": 0.010287237453707427, "kl": 0.001125335693359375, "learning_rate": 8.538476746865066e-07, "loss": 0.0, "num_tokens": 4469056.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.33828449083366785, "frac_reward_zero_std": 1.0, "grad_norm": 0.011904281648415534, "kl": 0.0005130767822265625, "learning_rate": 8.535374664983902e-07, "loss": 0.0, "num_tokens": 4472070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.3385521209688211, "frac_reward_zero_std": 0.5, "grad_norm": 0.6822435941068832, "kl": 0.00144195556640625, "learning_rate": 8.532269934247135e-07, "loss": 0.0162, "num_tokens": 4475818.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 473.25, "completions/mean_terminated_length": 473.25, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.33881975110397433, "frac_reward_zero_std": 1.0, "grad_norm": 0.011890001441422722, "kl": 0.001373291015625, "learning_rate": 8.529162557364147e-07, "loss": 0.0001, "num_tokens": 4480744.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 376.125, "completions/mean_terminated_length": 376.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.3390873812391275, "frac_reward_zero_std": 1.0, "grad_norm": 0.009540005327838923, "kl": 0.0008544921875, "learning_rate": 8.526052537046625e-07, "loss": 0.0, "num_tokens": 4484853.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 236.5, "completions/mean_terminated_length": 236.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.33935501137428076, "frac_reward_zero_std": 1.0, "grad_norm": 0.015229599610226454, "kl": 0.000881195068359375, "learning_rate": 8.522939876008567e-07, "loss": 0.0, "num_tokens": 4487757.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.33962264150943394, "frac_reward_zero_std": 1.0, "grad_norm": 0.01818600370771412, "kl": 0.001194000244140625, "learning_rate": 8.519824576966273e-07, "loss": 0.0, "num_tokens": 4490717.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3398902716445872, "frac_reward_zero_std": 1.0, "grad_norm": 0.014543856289406114, "kl": 0.001476287841796875, "learning_rate": 8.516706642638344e-07, "loss": 0.0001, "num_tokens": 4494727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3401579017797404, "frac_reward_zero_std": 1.0, "grad_norm": 0.029921618410059143, "kl": 0.00177764892578125, "learning_rate": 8.513586075745684e-07, "loss": 0.0001, "num_tokens": 4498392.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 225.625, "completions/mean_terminated_length": 225.625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.3404255319148936, "frac_reward_zero_std": 1.0, "grad_norm": 0.01604066703589348, "kl": 0.0009860992431640625, "learning_rate": 8.510462879011492e-07, "loss": 0.0, "num_tokens": 4501281.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 399.125, "completions/mean_terminated_length": 399.125, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.34069316205004685, "frac_reward_zero_std": 1.0, "grad_norm": 0.027382569091407707, "kl": 0.001674652099609375, "learning_rate": 8.507337055161262e-07, "loss": 0.0001, "num_tokens": 4505822.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.34096079218520003, "frac_reward_zero_std": 1.0, "grad_norm": 0.01757209800187514, "kl": 0.001667022705078125, "learning_rate": 8.504208606922781e-07, "loss": 0.0001, "num_tokens": 4509926.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3412284223203533, "frac_reward_zero_std": 1.0, "grad_norm": 0.01993066325259242, "kl": 0.00138092041015625, "learning_rate": 8.501077537026129e-07, "loss": 0.0001, "num_tokens": 4513150.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 357.875, "completions/mean_terminated_length": 357.875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3414960524555065, "frac_reward_zero_std": 0.5, "grad_norm": 0.5771337527030064, "kl": 0.001331329345703125, "learning_rate": 8.497943848203669e-07, "loss": 0.0098, "num_tokens": 4517313.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 364.875, "completions/mean_terminated_length": 364.875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.3417636825906597, "frac_reward_zero_std": 1.0, "grad_norm": 0.01929972630483027, "kl": 0.001598358154296875, "learning_rate": 8.494807543190051e-07, "loss": 0.0001, "num_tokens": 4521316.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.34203131272581294, "frac_reward_zero_std": 1.0, "grad_norm": 0.00894470352490886, "kl": 0.0006732940673828125, "learning_rate": 8.49166862472221e-07, "loss": 0.0, "num_tokens": 4524275.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.3422989428609661, "frac_reward_zero_std": 1.0, "grad_norm": 0.016350749793861814, "kl": 0.00140380859375, "learning_rate": 8.48852709553936e-07, "loss": 0.0001, "num_tokens": 4527348.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.34256657299611937, "frac_reward_zero_std": 1.0, "grad_norm": 0.04916864843639528, "kl": 0.00264739990234375, "learning_rate": 8.485382958382994e-07, "loss": 0.0001, "num_tokens": 4530586.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.3428342031312726, "frac_reward_zero_std": 1.0, "grad_norm": 0.04439568576514583, "kl": 0.0023040771484375, "learning_rate": 8.48223621599688e-07, "loss": 0.0001, "num_tokens": 4534057.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.3431018332664258, "frac_reward_zero_std": 1.0, "grad_norm": 0.016710224878929265, "kl": 0.00157928466796875, "learning_rate": 8.479086871127057e-07, "loss": 0.0001, "num_tokens": 4537303.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.34336946340157903, "frac_reward_zero_std": 1.0, "grad_norm": 0.01599349493179072, "kl": 0.0010623931884765625, "learning_rate": 8.475934926521844e-07, "loss": 0.0, "num_tokens": 4540522.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.3436370935367322, "frac_reward_zero_std": 1.0, "grad_norm": 0.01967646374291685, "kl": 0.00118255615234375, "learning_rate": 8.472780384931819e-07, "loss": 0.0, "num_tokens": 4543798.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 429.875, "completions/mean_terminated_length": 429.875, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.34390472367188546, "frac_reward_zero_std": 0.5, "grad_norm": 0.9603644953830571, "kl": 0.00140380859375, "learning_rate": 8.469623249109831e-07, "loss": -0.0512, "num_tokens": 4548569.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 315.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.3441723538070387, "frac_reward_zero_std": 1.0, "grad_norm": 0.015296260836302278, "kl": 0.0006256103515625, "learning_rate": 8.46646352181099e-07, "loss": 0.0, "num_tokens": 4552087.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 228.375, "completions/mean_terminated_length": 228.375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.3444399839421919, "frac_reward_zero_std": 0.5, "grad_norm": 0.8609702063454261, "kl": 0.001125335693359375, "learning_rate": 8.463301205792675e-07, "loss": 0.0404, "num_tokens": 4554834.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 366.0, "completions/mean_terminated_length": 366.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3447076140773451, "frac_reward_zero_std": 1.0, "grad_norm": 0.014925623258583593, "kl": 0.00164031982421875, "learning_rate": 8.46013630381451e-07, "loss": 0.0001, "num_tokens": 4558826.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 403.875, "completions/mean_terminated_length": 403.875, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.3449752442124983, "frac_reward_zero_std": 1.0, "grad_norm": 0.010582164736873574, "kl": 0.000873565673828125, "learning_rate": 8.456968818638392e-07, "loss": 0.0, "num_tokens": 4563137.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.34524287434765155, "frac_reward_zero_std": 1.0, "grad_norm": 0.04191552669082859, "kl": 0.00196075439453125, "learning_rate": 8.45379875302846e-07, "loss": 0.0001, "num_tokens": 4565965.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 365.125, "completions/mean_terminated_length": 365.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.34551050448280474, "frac_reward_zero_std": 1.0, "grad_norm": 0.044350963932211114, "kl": 0.0018768310546875, "learning_rate": 8.450626109751108e-07, "loss": 0.0001, "num_tokens": 4569946.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.345778134617958, "frac_reward_zero_std": 1.0, "grad_norm": 0.02543579449546997, "kl": 0.00226593017578125, "learning_rate": 8.447450891574985e-07, "loss": 0.0001, "num_tokens": 4573375.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 364.875, "completions/mean_terminated_length": 364.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.3460457647531112, "frac_reward_zero_std": 0.0, "grad_norm": 0.8920771159208051, "kl": 0.0008697509765625, "learning_rate": 8.444273101270981e-07, "loss": 0.0281, "num_tokens": 4577358.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 369.75, "completions/mean_terminated_length": 369.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.3463133948882644, "frac_reward_zero_std": 1.0, "grad_norm": 0.02129074347886217, "kl": 0.00164031982421875, "learning_rate": 8.441092741612233e-07, "loss": 0.0001, "num_tokens": 4581660.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.34658102502341764, "frac_reward_zero_std": 1.0, "grad_norm": 0.026508355976212695, "kl": 0.00133514404296875, "learning_rate": 8.437909815374116e-07, "loss": 0.0001, "num_tokens": 4584760.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 436.125, "completions/mean_terminated_length": 436.125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "epoch": 0.34684865515857083, "frac_reward_zero_std": 1.0, "grad_norm": 0.011677439076696822, "kl": 0.0010528564453125, "learning_rate": 8.434724325334251e-07, "loss": 0.0, "num_tokens": 4589273.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.34711628529372407, "frac_reward_zero_std": 1.0, "grad_norm": 0.013580399666396242, "kl": 0.00098419189453125, "learning_rate": 8.431536274272496e-07, "loss": 0.0, "num_tokens": 4592307.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 320.125, "completions/mean_terminated_length": 320.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.3473839154288773, "frac_reward_zero_std": 1.0, "grad_norm": 0.012669661207217964, "kl": 0.0006046295166015625, "learning_rate": 8.428345664970935e-07, "loss": 0.0, "num_tokens": 4595820.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.3476515455640305, "frac_reward_zero_std": 1.0, "grad_norm": 0.016995508186735158, "kl": 0.0012378692626953125, "learning_rate": 8.425152500213897e-07, "loss": 0.0, "num_tokens": 4598450.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 328.125, "completions/mean_terminated_length": 328.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.34791917569918374, "frac_reward_zero_std": 0.5, "grad_norm": 1.31696402943726, "kl": 0.00334930419921875, "learning_rate": 8.421956782787932e-07, "loss": 0.0678, "num_tokens": 4602535.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 386.875, "completions/mean_terminated_length": 386.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.3481868058343369, "frac_reward_zero_std": 0.5, "grad_norm": 0.5690530989986303, "kl": 0.0032501220703125, "learning_rate": 8.418758515481822e-07, "loss": -0.0177, "num_tokens": 4606874.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 334.125, "completions/mean_terminated_length": 334.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.34845443596949016, "frac_reward_zero_std": 1.0, "grad_norm": 0.0280846389688033, "kl": 0.002101898193359375, "learning_rate": 8.41555770108657e-07, "loss": 0.0001, "num_tokens": 4610651.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3487220661046434, "frac_reward_zero_std": 1.0, "grad_norm": 0.01752324572922759, "kl": 0.001079559326171875, "learning_rate": 8.412354342395408e-07, "loss": 0.0, "num_tokens": 4614197.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.3489896962397966, "frac_reward_zero_std": 1.0, "grad_norm": 0.0251590673956811, "kl": 0.002044677734375, "learning_rate": 8.409148442203784e-07, "loss": 0.0001, "num_tokens": 4616892.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.34925732637494983, "frac_reward_zero_std": 1.0, "grad_norm": 0.03696507314410182, "kl": 0.00247955322265625, "learning_rate": 8.405940003309364e-07, "loss": 0.0001, "num_tokens": 4620132.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.349524956510103, "frac_reward_zero_std": 1.0, "grad_norm": 0.011677968270194457, "kl": 0.000812530517578125, "learning_rate": 8.402729028512029e-07, "loss": 0.0, "num_tokens": 4623852.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.34979258664525625, "frac_reward_zero_std": 0.5, "grad_norm": 1.000394280955742, "kl": 0.001087188720703125, "learning_rate": 8.399515520613877e-07, "loss": -0.0186, "num_tokens": 4626861.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.3500602167804095, "frac_reward_zero_std": 0.5, "grad_norm": 0.9431600751563941, "kl": 0.0025634765625, "learning_rate": 8.396299482419212e-07, "loss": 0.0001, "num_tokens": 4630301.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.3503278469155627, "frac_reward_zero_std": 1.0, "grad_norm": 0.00973478974916967, "kl": 0.00043773651123046875, "learning_rate": 8.393080916734547e-07, "loss": 0.0, "num_tokens": 4633166.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 368.875, "completions/mean_terminated_length": 368.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3505954770507159, "frac_reward_zero_std": 0.5, "grad_norm": 0.7340032886029357, "kl": 0.001708984375, "learning_rate": 8.389859826368603e-07, "loss": -0.0076, "num_tokens": 4637113.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.3508631071858691, "frac_reward_zero_std": 1.0, "grad_norm": 0.020000767422905165, "kl": 0.001712799072265625, "learning_rate": 8.386636214132301e-07, "loss": 0.0001, "num_tokens": 4640518.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.35113073732102235, "frac_reward_zero_std": 1.0, "grad_norm": 0.014343633202063584, "kl": 0.00090789794921875, "learning_rate": 8.383410082838765e-07, "loss": 0.0, "num_tokens": 4643937.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 319.0, "completions/mean_terminated_length": 319.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3513983674561756, "frac_reward_zero_std": 1.0, "grad_norm": 0.015256276526117869, "kl": 0.0013427734375, "learning_rate": 8.380181435303318e-07, "loss": 0.0001, "num_tokens": 4647893.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 209.875, "completions/mean_terminated_length": 209.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.35166599759132877, "frac_reward_zero_std": 1.0, "grad_norm": 0.02222083292335405, "kl": 0.00176239013671875, "learning_rate": 8.376950274343476e-07, "loss": 0.0001, "num_tokens": 4650752.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.351933627726482, "frac_reward_zero_std": 1.0, "grad_norm": 0.021349746000756696, "kl": 0.00177764892578125, "learning_rate": 8.373716602778951e-07, "loss": 0.0001, "num_tokens": 4653628.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 368.5, "completions/mean_terminated_length": 368.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.3522012578616352, "frac_reward_zero_std": 1.0, "grad_norm": 0.1941483011780532, "kl": 0.0059661865234375, "learning_rate": 8.370480423431644e-07, "loss": 0.0002, "num_tokens": 4657952.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 248.375, "completions/mean_terminated_length": 248.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.35246888799678844, "frac_reward_zero_std": 1.0, "grad_norm": 0.010391390061103161, "kl": 0.0005979537963867188, "learning_rate": 8.367241739125644e-07, "loss": 0.0, "num_tokens": 4660923.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 458.75, "completions/mean_terminated_length": 458.75, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.3527365181319417, "frac_reward_zero_std": 0.5, "grad_norm": 0.9280780010594283, "kl": 0.00788116455078125, "learning_rate": 8.364000552687227e-07, "loss": 0.0014, "num_tokens": 4665861.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.35300414826709486, "frac_reward_zero_std": 1.0, "grad_norm": 0.01823819675366886, "kl": 0.0015869140625, "learning_rate": 8.360756866944857e-07, "loss": 0.0001, "num_tokens": 4669705.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 322.875, "completions/mean_terminated_length": 322.875, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.3532717784022481, "frac_reward_zero_std": 1.0, "grad_norm": 0.024079029829905, "kl": 0.002105712890625, "learning_rate": 8.357510684729169e-07, "loss": 0.0001, "num_tokens": 4673332.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.3535394085374013, "frac_reward_zero_std": 1.0, "grad_norm": 0.12823812145008676, "kl": 0.011474609375, "learning_rate": 8.354262008872985e-07, "loss": 0.0005, "num_tokens": 4676669.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.35380703867255453, "frac_reward_zero_std": 1.0, "grad_norm": 0.04668545644562953, "kl": 0.00328826904296875, "learning_rate": 8.351010842211301e-07, "loss": 0.0001, "num_tokens": 4679773.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.35407466880770777, "frac_reward_zero_std": 0.5, "grad_norm": 0.59935540509556, "kl": 0.0016937255859375, "learning_rate": 8.347757187581287e-07, "loss": -0.0505, "num_tokens": 4682877.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.35434229894286096, "frac_reward_zero_std": 1.0, "grad_norm": 0.01752488352628413, "kl": 0.001270294189453125, "learning_rate": 8.34450104782228e-07, "loss": 0.0001, "num_tokens": 4685807.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.3546099290780142, "frac_reward_zero_std": 1.0, "grad_norm": 0.03773931328703462, "kl": 0.00232696533203125, "learning_rate": 8.341242425775791e-07, "loss": 0.0001, "num_tokens": 4689803.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.3548775592131674, "frac_reward_zero_std": 1.0, "grad_norm": 0.020201489368992024, "kl": 0.00165557861328125, "learning_rate": 8.337981324285494e-07, "loss": 0.0001, "num_tokens": 4693288.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 275.0, "completions/mean_terminated_length": 275.0, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3551451893483206, "frac_reward_zero_std": 1.0, "grad_norm": 0.03610156174639752, "kl": 0.00283050537109375, "learning_rate": 8.334717746197228e-07, "loss": 0.0001, "num_tokens": 4696704.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 282.75, "completions/mean_terminated_length": 282.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.35541281948347386, "frac_reward_zero_std": 0.5, "grad_norm": 1.0584791025286915, "kl": 0.0031280517578125, "learning_rate": 8.331451694358995e-07, "loss": -0.0097, "num_tokens": 4700230.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 315.625, "completions/mean_terminated_length": 315.625, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.35568044961862705, "frac_reward_zero_std": 1.0, "grad_norm": 0.013942469707903913, "kl": 0.0007266998291015625, "learning_rate": 8.328183171620953e-07, "loss": 0.0, "num_tokens": 4703723.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.3559480797537803, "frac_reward_zero_std": 1.0, "grad_norm": 0.015910327906204853, "kl": 0.0010395050048828125, "learning_rate": 8.324912180835413e-07, "loss": 0.0, "num_tokens": 4706954.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.3562157098889335, "frac_reward_zero_std": 1.0, "grad_norm": 0.015454110115274615, "kl": 0.000980377197265625, "learning_rate": 8.321638724856847e-07, "loss": 0.0, "num_tokens": 4711435.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.3564833400240867, "frac_reward_zero_std": 1.0, "grad_norm": 0.020286164124455837, "kl": 0.00170135498046875, "learning_rate": 8.318362806541876e-07, "loss": 0.0001, "num_tokens": 4714571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.35675097015923996, "frac_reward_zero_std": 1.0, "grad_norm": 0.038144296650340255, "kl": 0.002655029296875, "learning_rate": 8.315084428749269e-07, "loss": 0.0001, "num_tokens": 4717927.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.35701860029439314, "frac_reward_zero_std": 1.0, "grad_norm": 0.013794448264287416, "kl": 0.00098419189453125, "learning_rate": 8.31180359433994e-07, "loss": 0.0, "num_tokens": 4720980.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.3572862304295464, "frac_reward_zero_std": 1.0, "grad_norm": 0.026846294899970903, "kl": 0.00140380859375, "learning_rate": 8.308520306176947e-07, "loss": 0.0001, "num_tokens": 4723967.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.35755386056469957, "frac_reward_zero_std": 1.0, "grad_norm": 0.08799668422781393, "kl": 0.00356292724609375, "learning_rate": 8.305234567125492e-07, "loss": 0.0001, "num_tokens": 4727578.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.3578214906998528, "frac_reward_zero_std": 1.0, "grad_norm": 0.011601399528455722, "kl": 0.000873565673828125, "learning_rate": 8.301946380052912e-07, "loss": 0.0, "num_tokens": 4731333.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.35808912083500605, "frac_reward_zero_std": 1.0, "grad_norm": 0.017400355773845803, "kl": 0.001239776611328125, "learning_rate": 8.298655747828684e-07, "loss": 0.0, "num_tokens": 4735001.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.35835675097015923, "frac_reward_zero_std": 1.0, "grad_norm": 0.010797825045323993, "kl": 0.0007734298706054688, "learning_rate": 8.295362673324417e-07, "loss": 0.0, "num_tokens": 4738605.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 354.125, "completions/mean_terminated_length": 354.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.3586243811053125, "frac_reward_zero_std": 0.5, "grad_norm": 1.0929412181770497, "kl": 0.002960205078125, "learning_rate": 8.292067159413853e-07, "loss": 0.0702, "num_tokens": 4742602.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.35889201124046566, "frac_reward_zero_std": 1.0, "grad_norm": 0.02133390999178578, "kl": 0.00193023681640625, "learning_rate": 8.288769208972857e-07, "loss": 0.0001, "num_tokens": 4745916.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.3591596413756189, "frac_reward_zero_std": 1.0, "grad_norm": 0.012117096056630963, "kl": 0.00089263916015625, "learning_rate": 8.285468824879428e-07, "loss": 0.0, "num_tokens": 4749283.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.35942727151077214, "frac_reward_zero_std": 1.0, "grad_norm": 0.02744389654438774, "kl": 0.0015163421630859375, "learning_rate": 8.282166010013682e-07, "loss": 0.0001, "num_tokens": 4752161.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 341.625, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.3596949016459253, "frac_reward_zero_std": 1.0, "grad_norm": 0.020526744526248343, "kl": 0.0016021728515625, "learning_rate": 8.278860767257864e-07, "loss": 0.0001, "num_tokens": 4756186.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.35996253178107857, "frac_reward_zero_std": 1.0, "grad_norm": 0.010007296800690579, "kl": 0.000934600830078125, "learning_rate": 8.275553099496329e-07, "loss": 0.0, "num_tokens": 4758888.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.36023016191623175, "frac_reward_zero_std": 1.0, "grad_norm": 0.021914070829791583, "kl": 0.001331329345703125, "learning_rate": 8.272243009615555e-07, "loss": 0.0001, "num_tokens": 4762253.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 427.375, "completions/mean_terminated_length": 427.375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.360497792051385, "frac_reward_zero_std": 1.0, "grad_norm": 0.03217904638579002, "kl": 0.00218963623046875, "learning_rate": 8.26893050050413e-07, "loss": 0.0001, "num_tokens": 4766928.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.36076542218653823, "frac_reward_zero_std": 1.0, "grad_norm": 0.033473928744873115, "kl": 0.00301361083984375, "learning_rate": 8.265615575052753e-07, "loss": 0.0001, "num_tokens": 4770444.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3610330523216914, "frac_reward_zero_std": 1.0, "grad_norm": 0.017933933600907365, "kl": 0.001861572265625, "learning_rate": 8.262298236154238e-07, "loss": 0.0001, "num_tokens": 4773678.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.36130068245684466, "frac_reward_zero_std": 1.0, "grad_norm": 0.01847131304831832, "kl": 0.002315521240234375, "learning_rate": 8.258978486703493e-07, "loss": 0.0001, "num_tokens": 4776879.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.36156831259199784, "frac_reward_zero_std": 1.0, "grad_norm": 0.029995334327978507, "kl": 0.00156402587890625, "learning_rate": 8.25565632959754e-07, "loss": 0.0001, "num_tokens": 4780160.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3618359427271511, "frac_reward_zero_std": 1.0, "grad_norm": 0.019065429582753004, "kl": 0.0014934539794921875, "learning_rate": 8.252331767735499e-07, "loss": 0.0001, "num_tokens": 4782805.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 208.125, "completions/mean_terminated_length": 208.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.36210357286230427, "frac_reward_zero_std": 1.0, "grad_norm": 0.05471388468812278, "kl": 0.00263214111328125, "learning_rate": 8.249004804018588e-07, "loss": 0.0001, "num_tokens": 4785514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 476.0, "completions/mean_terminated_length": 476.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3623712029974575, "frac_reward_zero_std": 1.0, "grad_norm": 0.010877498922327547, "kl": 0.0012969970703125, "learning_rate": 8.245675441350122e-07, "loss": 0.0001, "num_tokens": 4790458.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 218.375, "completions/mean_terminated_length": 218.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.36263883313261075, "frac_reward_zero_std": 1.0, "grad_norm": 0.018173090389700434, "kl": 0.0011196136474609375, "learning_rate": 8.242343682635508e-07, "loss": 0.0, "num_tokens": 4793153.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.36290646326776393, "frac_reward_zero_std": 1.0, "grad_norm": 0.00983104281846532, "kl": 0.0006227493286132812, "learning_rate": 8.239009530782243e-07, "loss": 0.0, "num_tokens": 4796333.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 359.5, "completions/mean_terminated_length": 359.5, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3631740934029172, "frac_reward_zero_std": 1.0, "grad_norm": 0.01298792526619553, "kl": 0.001255035400390625, "learning_rate": 8.235672988699918e-07, "loss": 0.0001, "num_tokens": 4800297.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.36344172353807036, "frac_reward_zero_std": 0.5, "grad_norm": 1.1727070307761227, "kl": 0.000827789306640625, "learning_rate": 8.2323340593002e-07, "loss": 0.0203, "num_tokens": 4803302.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.3637093536732236, "frac_reward_zero_std": 1.0, "grad_norm": 0.025885988860626135, "kl": 0.00183868408203125, "learning_rate": 8.228992745496851e-07, "loss": 0.0001, "num_tokens": 4807637.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.36397698380837684, "frac_reward_zero_std": 1.0, "grad_norm": 0.014103459750464764, "kl": 0.0012378692626953125, "learning_rate": 8.225649050205705e-07, "loss": 0.0, "num_tokens": 4811440.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 269.875, "completions/mean_terminated_length": 269.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.36424461394353, "frac_reward_zero_std": 1.0, "grad_norm": 0.010964732366643554, "kl": 0.0010356903076171875, "learning_rate": 8.222302976344677e-07, "loss": 0.0, "num_tokens": 4814627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.36451224407868327, "frac_reward_zero_std": 0.0, "grad_norm": 1.4158318769341651, "kl": 0.0018310546875, "learning_rate": 8.218954526833761e-07, "loss": -0.0231, "num_tokens": 4818388.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.36477987421383645, "frac_reward_zero_std": 0.5, "grad_norm": 0.6742764772101809, "kl": 0.003082275390625, "learning_rate": 8.215603704595016e-07, "loss": 0.0048, "num_tokens": 4822270.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.3650475043489897, "frac_reward_zero_std": 1.0, "grad_norm": 0.010387499801938541, "kl": 0.0008029937744140625, "learning_rate": 8.212250512552584e-07, "loss": 0.0, "num_tokens": 4826442.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.36531513448414293, "frac_reward_zero_std": 1.0, "grad_norm": 0.011289494890919098, "kl": 0.0008525848388671875, "learning_rate": 8.208894953632658e-07, "loss": 0.0, "num_tokens": 4829382.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3655827646192961, "frac_reward_zero_std": 1.0, "grad_norm": 0.03728647641800799, "kl": 0.00203704833984375, "learning_rate": 8.205537030763516e-07, "loss": 0.0001, "num_tokens": 4832757.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 318.875, "completions/mean_terminated_length": 318.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.36585039475444936, "frac_reward_zero_std": 0.5, "grad_norm": 0.7176282600251349, "kl": 0.00151824951171875, "learning_rate": 8.202176746875481e-07, "loss": -0.0216, "num_tokens": 4836384.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.36611802488960254, "frac_reward_zero_std": 1.0, "grad_norm": 0.032112004281969385, "kl": 0.00325775146484375, "learning_rate": 8.19881410490095e-07, "loss": 0.0001, "num_tokens": 4839454.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 374.625, "completions/mean_terminated_length": 374.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.3663856550247558, "frac_reward_zero_std": 0.5, "grad_norm": 0.6880468015684553, "kl": 0.00127410888671875, "learning_rate": 8.19544910777437e-07, "loss": -0.1124, "num_tokens": 4843415.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 459.75, "completions/mean_terminated_length": 459.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.366653285159909, "frac_reward_zero_std": 0.5, "grad_norm": 0.6754289895588667, "kl": 0.00154876708984375, "learning_rate": 8.192081758432245e-07, "loss": -0.0017, "num_tokens": 4848377.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 398.25, "completions/mean_terminated_length": 398.25, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.3669209152950622, "frac_reward_zero_std": 1.0, "grad_norm": 0.01876753976529766, "kl": 0.001308441162109375, "learning_rate": 8.188712059813135e-07, "loss": 0.0001, "num_tokens": 4852723.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 235.0, "completions/mean_terminated_length": 235.0, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.36718854543021545, "frac_reward_zero_std": 1.0, "grad_norm": 0.022057703502181617, "kl": 0.00173187255859375, "learning_rate": 8.185340014857644e-07, "loss": 0.0001, "num_tokens": 4855499.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.36745617556536864, "frac_reward_zero_std": 1.0, "grad_norm": 0.011858253813000227, "kl": 0.000949859619140625, "learning_rate": 8.181965626508429e-07, "loss": 0.0, "num_tokens": 4858149.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.3677238057005219, "frac_reward_zero_std": 1.0, "grad_norm": 0.016726236390523615, "kl": 0.0013275146484375, "learning_rate": 8.178588897710188e-07, "loss": 0.0001, "num_tokens": 4861411.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 521.0, "completions/mean_terminated_length": 449.14288330078125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.3679914358356751, "frac_reward_zero_std": 0.5, "grad_norm": 0.662039743015024, "kl": 0.001438140869140625, "learning_rate": 8.175209831409666e-07, "loss": 0.0829, "num_tokens": 4866731.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.3682590659708283, "frac_reward_zero_std": 1.0, "grad_norm": 0.02341884854604649, "kl": 0.0023956298828125, "learning_rate": 8.171828430555643e-07, "loss": 0.0001, "num_tokens": 4869668.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 557.25, "completions/mean_terminated_length": 557.25, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.36852669610598154, "frac_reward_zero_std": 0.0, "grad_norm": 0.9540746750603859, "kl": 0.00133514404296875, "learning_rate": 8.16844469809894e-07, "loss": 0.0153, "num_tokens": 4875710.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.36879432624113473, "frac_reward_zero_std": 1.0, "grad_norm": 0.02522980875772962, "kl": 0.00183868408203125, "learning_rate": 8.16505863699241e-07, "loss": 0.0001, "num_tokens": 4878576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.36906195637628797, "frac_reward_zero_std": 0.5, "grad_norm": 1.4332023371918035, "kl": 0.00217437744140625, "learning_rate": 8.16167025019094e-07, "loss": -0.0185, "num_tokens": 4881499.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.3693295865114412, "frac_reward_zero_std": 1.0, "grad_norm": 0.015120101422206975, "kl": 0.0014934539794921875, "learning_rate": 8.158279540651444e-07, "loss": 0.0001, "num_tokens": 4884998.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.3695972166465944, "frac_reward_zero_std": 0.5, "grad_norm": 0.9079272699770341, "kl": 0.00121307373046875, "learning_rate": 8.154886511332867e-07, "loss": -0.0093, "num_tokens": 4888796.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.36986484678174764, "frac_reward_zero_std": 1.0, "grad_norm": 0.023552697371694298, "kl": 0.00188446044921875, "learning_rate": 8.151491165196176e-07, "loss": 0.0001, "num_tokens": 4892711.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.3701324769169008, "frac_reward_zero_std": 0.5, "grad_norm": 0.9726915588966861, "kl": 0.001056671142578125, "learning_rate": 8.148093505204358e-07, "loss": -0.0167, "num_tokens": 4896746.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.37040010705205406, "frac_reward_zero_std": 1.0, "grad_norm": 0.012000199189538237, "kl": 0.0010967254638671875, "learning_rate": 8.144693534322424e-07, "loss": 0.0, "num_tokens": 4900499.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 254.0, "completions/mean_terminated_length": 254.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3706677371872073, "frac_reward_zero_std": 1.0, "grad_norm": 0.022092110380890463, "kl": 0.00206756591796875, "learning_rate": 8.141291255517397e-07, "loss": 0.0001, "num_tokens": 4903491.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.3709353673223605, "frac_reward_zero_std": 1.0, "grad_norm": 0.011414165648707817, "kl": 0.000926971435546875, "learning_rate": 8.137886671758316e-07, "loss": 0.0, "num_tokens": 4906745.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.37120299745751373, "frac_reward_zero_std": 1.0, "grad_norm": 0.054499317943106494, "kl": 0.00305938720703125, "learning_rate": 8.134479786016231e-07, "loss": 0.0001, "num_tokens": 4909959.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 462.5, "completions/mean_terminated_length": 462.5, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.3714706275926669, "frac_reward_zero_std": 0.5, "grad_norm": 0.630677288323303, "kl": 0.00109100341796875, "learning_rate": 8.131070601264203e-07, "loss": 0.0367, "num_tokens": 4914679.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 392.875, "completions/mean_terminated_length": 392.875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.37173825772782015, "frac_reward_zero_std": 1.0, "grad_norm": 0.019679988868134567, "kl": 0.001567840576171875, "learning_rate": 8.127659120477294e-07, "loss": 0.0001, "num_tokens": 4918710.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 286.25, "completions/mean_terminated_length": 286.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.3720058878629734, "frac_reward_zero_std": 1.0, "grad_norm": 0.027943554662903665, "kl": 0.0019989013671875, "learning_rate": 8.124245346632577e-07, "loss": 0.0001, "num_tokens": 4922028.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3722735179981266, "frac_reward_zero_std": 1.0, "grad_norm": 0.02024335011376883, "kl": 0.00186920166015625, "learning_rate": 8.12082928270912e-07, "loss": 0.0001, "num_tokens": 4925276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 372.5, "completions/mean_terminated_length": 372.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3725411481332798, "frac_reward_zero_std": 1.0, "grad_norm": 0.02745885389219221, "kl": 0.00138092041015625, "learning_rate": 8.117410931687991e-07, "loss": 0.0001, "num_tokens": 4929428.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.372808778268433, "frac_reward_zero_std": 1.0, "grad_norm": 0.03442496500717143, "kl": 0.00220489501953125, "learning_rate": 8.113990296552255e-07, "loss": 0.0001, "num_tokens": 4932226.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 317.5, "completions/mean_terminated_length": 317.5, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.37307640840358625, "frac_reward_zero_std": 1.0, "grad_norm": 0.02576200546311016, "kl": 0.0018768310546875, "learning_rate": 8.11056738028697e-07, "loss": 0.0001, "num_tokens": 4935886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.3733440385387395, "frac_reward_zero_std": 1.0, "grad_norm": 0.014822122206528463, "kl": 0.001468658447265625, "learning_rate": 8.107142185879184e-07, "loss": 0.0001, "num_tokens": 4939758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.37361166867389267, "frac_reward_zero_std": 0.5, "grad_norm": 1.2405435595780259, "kl": 0.0025177001953125, "learning_rate": 8.103714716317936e-07, "loss": 0.0001, "num_tokens": 4943629.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.3738792988090459, "frac_reward_zero_std": 1.0, "grad_norm": 0.02420975693135397, "kl": 0.00220489501953125, "learning_rate": 8.100284974594242e-07, "loss": 0.0001, "num_tokens": 4946635.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 301.5, "completions/mean_terminated_length": 301.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.3741469289441991, "frac_reward_zero_std": 1.0, "grad_norm": 0.011467484804941861, "kl": 0.001094818115234375, "learning_rate": 8.096852963701112e-07, "loss": 0.0, "num_tokens": 4950007.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.37441455907935234, "frac_reward_zero_std": 1.0, "grad_norm": 0.030421651500301166, "kl": 0.001811981201171875, "learning_rate": 8.093418686633531e-07, "loss": 0.0001, "num_tokens": 4952977.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.3746821892145056, "frac_reward_zero_std": 1.0, "grad_norm": 0.05207728202126054, "kl": 0.00689697265625, "learning_rate": 8.089982146388456e-07, "loss": 0.0003, "num_tokens": 4956011.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.37494981934965876, "frac_reward_zero_std": 1.0, "grad_norm": 0.022487991962612017, "kl": 0.0020294189453125, "learning_rate": 8.086543345964832e-07, "loss": 0.0001, "num_tokens": 4959277.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.375217449484812, "frac_reward_zero_std": 1.0, "grad_norm": 0.08145128302517014, "kl": 0.0081329345703125, "learning_rate": 8.083102288363561e-07, "loss": 0.0003, "num_tokens": 4962749.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3754850796199652, "frac_reward_zero_std": 1.0, "grad_norm": 0.040423984070858725, "kl": 0.001697540283203125, "learning_rate": 8.079658976587528e-07, "loss": 0.0001, "num_tokens": 4965997.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 335.875, "completions/mean_terminated_length": 335.875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.37575270975511843, "frac_reward_zero_std": 0.5, "grad_norm": 0.7643162303310568, "kl": 0.0011138916015625, "learning_rate": 8.076213413641579e-07, "loss": 0.0364, "num_tokens": 4969852.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 222.0, "completions/mean_terminated_length": 222.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.37602033989027167, "frac_reward_zero_std": 1.0, "grad_norm": 0.022955968216963055, "kl": 0.00212860107421875, "learning_rate": 8.072765602532526e-07, "loss": 0.0001, "num_tokens": 4972604.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.37628797002542486, "frac_reward_zero_std": 1.0, "grad_norm": 0.019188890449249187, "kl": 0.0015411376953125, "learning_rate": 8.069315546269138e-07, "loss": 0.0001, "num_tokens": 4975444.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.3765556001605781, "frac_reward_zero_std": 0.5, "grad_norm": 0.9096893544937814, "kl": 0.00199127197265625, "learning_rate": 8.065863247862151e-07, "loss": -0.0164, "num_tokens": 4979881.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 631.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 387.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.3768232302957313, "frac_reward_zero_std": 0.5, "grad_norm": 0.7325042713025541, "kl": 0.0025482177734375, "learning_rate": 8.062408710324254e-07, "loss": -0.0127, "num_tokens": 4983969.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 364.125, "completions/mean_terminated_length": 364.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.3770908604308845, "frac_reward_zero_std": 1.0, "grad_norm": 0.04203172027527682, "kl": 0.00323486328125, "learning_rate": 8.05895193667009e-07, "loss": 0.0001, "num_tokens": 4987922.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.37735849056603776, "frac_reward_zero_std": 1.0, "grad_norm": 0.01857734851052412, "kl": 0.001800537109375, "learning_rate": 8.05549292991625e-07, "loss": 0.0001, "num_tokens": 4991418.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.37762612070119095, "frac_reward_zero_std": 1.0, "grad_norm": 0.0988552800680606, "kl": 0.0128021240234375, "learning_rate": 8.052031693081281e-07, "loss": 0.0005, "num_tokens": 4994589.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3778937508363442, "frac_reward_zero_std": 1.0, "grad_norm": 0.017571150236915006, "kl": 0.001537322998046875, "learning_rate": 8.048568229185672e-07, "loss": 0.0001, "num_tokens": 4998260.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3781613809714974, "frac_reward_zero_std": 1.0, "grad_norm": 0.02226432497257521, "kl": 0.00536346435546875, "learning_rate": 8.045102541251853e-07, "loss": 0.0002, "num_tokens": 5001262.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 465.25, "completions/mean_terminated_length": 465.25, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.3784290111066506, "frac_reward_zero_std": 0.5, "grad_norm": 0.5943497912022254, "kl": 0.00147247314453125, "learning_rate": 8.041634632304202e-07, "loss": 0.1086, "num_tokens": 5005988.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.3786966412418038, "frac_reward_zero_std": 1.0, "grad_norm": 0.024828190753709598, "kl": 0.0025787353515625, "learning_rate": 8.038164505369026e-07, "loss": 0.0001, "num_tokens": 5008844.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 296.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.37896427137695704, "frac_reward_zero_std": 1.0, "grad_norm": 0.009625729441848611, "kl": 0.001132965087890625, "learning_rate": 8.034692163474575e-07, "loss": 0.0, "num_tokens": 5012200.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.3792319015121103, "frac_reward_zero_std": 1.0, "grad_norm": 0.01790213806674243, "kl": 0.001438140869140625, "learning_rate": 8.031217609651029e-07, "loss": 0.0001, "num_tokens": 5015401.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.37949953164726347, "frac_reward_zero_std": 1.0, "grad_norm": 0.026006799147113856, "kl": 0.001491546630859375, "learning_rate": 8.027740846930496e-07, "loss": 0.0001, "num_tokens": 5018905.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 382.875, "completions/mean_terminated_length": 382.875, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.3797671617824167, "frac_reward_zero_std": 1.0, "grad_norm": 0.013198953195914107, "kl": 0.001468658447265625, "learning_rate": 8.024261878347019e-07, "loss": 0.0001, "num_tokens": 5023184.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3800347919175699, "frac_reward_zero_std": 1.0, "grad_norm": 0.043328250133438155, "kl": 0.00336456298828125, "learning_rate": 8.020780706936556e-07, "loss": 0.0001, "num_tokens": 5026716.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.38030242205272313, "frac_reward_zero_std": 1.0, "grad_norm": 0.009516898155810031, "kl": 0.001064300537109375, "learning_rate": 8.017297335736997e-07, "loss": 0.0, "num_tokens": 5030399.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3805700521878764, "frac_reward_zero_std": 1.0, "grad_norm": 0.012083422805892557, "kl": 0.00102996826171875, "learning_rate": 8.013811767788144e-07, "loss": 0.0, "num_tokens": 5033584.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 645.0, "completions/max_terminated_length": 645.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.38083768232302956, "frac_reward_zero_std": 0.5, "grad_norm": 0.6445819747939403, "kl": 0.00188446044921875, "learning_rate": 8.01032400613172e-07, "loss": 0.052, "num_tokens": 5037950.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3811053124581828, "frac_reward_zero_std": 1.0, "grad_norm": 0.014903175363028168, "kl": 0.0014190673828125, "learning_rate": 8.00683405381136e-07, "loss": 0.0001, "num_tokens": 5041182.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.381372942593336, "frac_reward_zero_std": 1.0, "grad_norm": 0.044208841099557875, "kl": 0.00250244140625, "learning_rate": 8.003341913872616e-07, "loss": 0.0001, "num_tokens": 5044074.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 354.25, "completions/mean_terminated_length": 354.25, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.3816405727284892, "frac_reward_zero_std": 1.0, "grad_norm": 0.011008591954862496, "kl": 0.0009021759033203125, "learning_rate": 7.999847589362941e-07, "loss": 0.0, "num_tokens": 5047860.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.38190820286364247, "frac_reward_zero_std": 1.0, "grad_norm": 0.010583166092386494, "kl": 0.0005235671997070312, "learning_rate": 7.996351083331702e-07, "loss": 0.0, "num_tokens": 5051299.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 416.75, "completions/mean_terminated_length": 416.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.38217583299879565, "frac_reward_zero_std": 1.0, "grad_norm": 0.01744870613572536, "kl": 0.0016326904296875, "learning_rate": 7.992852398830162e-07, "loss": 0.0001, "num_tokens": 5055705.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.3824434631339489, "frac_reward_zero_std": 0.5, "grad_norm": 1.2601779584066002, "kl": 0.001323699951171875, "learning_rate": 7.989351538911494e-07, "loss": -0.0269, "num_tokens": 5058771.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 444.0, "completions/mean_terminated_length": 444.0, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.3827110932691021, "frac_reward_zero_std": 1.0, "grad_norm": 0.011800559975845025, "kl": 0.00133514404296875, "learning_rate": 7.985848506630761e-07, "loss": 0.0001, "num_tokens": 5063571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 297.0, "completions/mean_terminated_length": 297.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.3829787234042553, "frac_reward_zero_std": 1.0, "grad_norm": 0.02106776006004605, "kl": 0.001739501953125, "learning_rate": 7.982343305044931e-07, "loss": 0.0001, "num_tokens": 5066975.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.38324635353940856, "frac_reward_zero_std": 1.0, "grad_norm": 0.017350236006625015, "kl": 0.00173187255859375, "learning_rate": 7.978835937212853e-07, "loss": 0.0001, "num_tokens": 5070453.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.38351398367456174, "frac_reward_zero_std": 1.0, "grad_norm": 0.018678461785988826, "kl": 0.001575469970703125, "learning_rate": 7.975326406195277e-07, "loss": 0.0001, "num_tokens": 5074276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.383781613809715, "frac_reward_zero_std": 1.0, "grad_norm": 0.014874709887946333, "kl": 0.00106048583984375, "learning_rate": 7.971814715054836e-07, "loss": 0.0, "num_tokens": 5077743.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 366.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.38404924394486817, "frac_reward_zero_std": 1.0, "grad_norm": 0.007361980675616528, "kl": 0.0006561279296875, "learning_rate": 7.96830086685605e-07, "loss": 0.0, "num_tokens": 5081790.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.3843168740800214, "frac_reward_zero_std": 1.0, "grad_norm": 0.015902416963357265, "kl": 0.0013580322265625, "learning_rate": 7.964784864665317e-07, "loss": 0.0001, "num_tokens": 5085581.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.38458450421517465, "frac_reward_zero_std": 0.5, "grad_norm": 1.2767526861600915, "kl": 0.0016326904296875, "learning_rate": 7.961266711550921e-07, "loss": -0.0665, "num_tokens": 5089641.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 316.75, "completions/mean_terminated_length": 316.75, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.38485213435032783, "frac_reward_zero_std": 1.0, "grad_norm": 0.01143026405034901, "kl": 0.000843048095703125, "learning_rate": 7.957746410583018e-07, "loss": 0.0, "num_tokens": 5093139.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.3851197644854811, "frac_reward_zero_std": 1.0, "grad_norm": 0.025417886000057226, "kl": 0.00197601318359375, "learning_rate": 7.954223964833641e-07, "loss": 0.0001, "num_tokens": 5096652.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.38538739462063426, "frac_reward_zero_std": 1.0, "grad_norm": 0.014478520412801284, "kl": 0.001697540283203125, "learning_rate": 7.950699377376695e-07, "loss": 0.0001, "num_tokens": 5099660.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.3856550247557875, "frac_reward_zero_std": 1.0, "grad_norm": 0.047345345357213076, "kl": 0.002086639404296875, "learning_rate": 7.94717265128795e-07, "loss": 0.0001, "num_tokens": 5102930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.38592265489094074, "frac_reward_zero_std": 1.0, "grad_norm": 0.030630865085894327, "kl": 0.0020751953125, "learning_rate": 7.943643789645048e-07, "loss": 0.0001, "num_tokens": 5106451.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.3861902850260939, "frac_reward_zero_std": 1.0, "grad_norm": 0.025580301282426124, "kl": 0.0021514892578125, "learning_rate": 7.940112795527491e-07, "loss": 0.0001, "num_tokens": 5109677.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 235.875, "completions/mean_terminated_length": 235.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.38645791516124717, "frac_reward_zero_std": 0.5, "grad_norm": 0.8822892848836376, "kl": 0.00194549560546875, "learning_rate": 7.936579672016643e-07, "loss": 0.015, "num_tokens": 5112768.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.38672554529640035, "frac_reward_zero_std": 1.0, "grad_norm": 0.017106987003530673, "kl": 0.0016021728515625, "learning_rate": 7.933044422195724e-07, "loss": 0.0001, "num_tokens": 5116103.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.3869931754315536, "frac_reward_zero_std": 1.0, "grad_norm": 0.015080371678598953, "kl": 0.001201629638671875, "learning_rate": 7.929507049149816e-07, "loss": 0.0, "num_tokens": 5119876.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 330.875, "completions/mean_terminated_length": 330.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.38726080556670683, "frac_reward_zero_std": 0.5, "grad_norm": 0.7659331481796156, "kl": 0.0020294189453125, "learning_rate": 7.925967555965847e-07, "loss": -0.0007, "num_tokens": 5123547.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.38752843570186, "frac_reward_zero_std": 1.0, "grad_norm": 0.06059677191633102, "kl": 0.00247955322265625, "learning_rate": 7.922425945732594e-07, "loss": 0.0001, "num_tokens": 5126663.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 348.75, "completions/mean_terminated_length": 348.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.38779606583701326, "frac_reward_zero_std": 1.0, "grad_norm": 0.013154469704668364, "kl": 0.00148773193359375, "learning_rate": 7.91888222154069e-07, "loss": 0.0001, "num_tokens": 5130477.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 213.875, "completions/mean_terminated_length": 213.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.38806369597216644, "frac_reward_zero_std": 1.0, "grad_norm": 0.015192917737954711, "kl": 0.001129150390625, "learning_rate": 7.915336386482606e-07, "loss": 0.0, "num_tokens": 5133116.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 257.75, "completions/mean_terminated_length": 257.75, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.3883313261073197, "frac_reward_zero_std": 1.0, "grad_norm": 0.013280054830215533, "kl": 0.0010833740234375, "learning_rate": 7.911788443652658e-07, "loss": 0.0, "num_tokens": 5136238.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 298.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.3885989562424729, "frac_reward_zero_std": 1.0, "grad_norm": 0.014184139198887731, "kl": 0.0016021728515625, "learning_rate": 7.908238396147001e-07, "loss": 0.0001, "num_tokens": 5139564.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.3888665863776261, "frac_reward_zero_std": 1.0, "grad_norm": 0.017428857268024185, "kl": 0.001506805419921875, "learning_rate": 7.904686247063619e-07, "loss": 0.0001, "num_tokens": 5142576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 297.25, "completions/mean_terminated_length": 297.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.38913421651277935, "frac_reward_zero_std": 1.0, "grad_norm": 0.027073013511701167, "kl": 0.001842498779296875, "learning_rate": 7.901131999502347e-07, "loss": 0.0001, "num_tokens": 5146066.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.38940184664793254, "frac_reward_zero_std": 0.5, "grad_norm": 0.7669045535276436, "kl": 0.002960205078125, "learning_rate": 7.897575656564835e-07, "loss": -0.001, "num_tokens": 5149930.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.3896694767830858, "frac_reward_zero_std": 1.0, "grad_norm": 0.013438160179152756, "kl": 0.001201629638671875, "learning_rate": 7.894017221354569e-07, "loss": 0.0, "num_tokens": 5153649.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.389937106918239, "frac_reward_zero_std": 1.0, "grad_norm": 0.015315024941521252, "kl": 0.001209259033203125, "learning_rate": 7.89045669697686e-07, "loss": 0.0, "num_tokens": 5157625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 390.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.3902047370533922, "frac_reward_zero_std": 0.5, "grad_norm": 1.1310916457821736, "kl": 0.00208282470703125, "learning_rate": 7.88689408653884e-07, "loss": -0.0766, "num_tokens": 5161933.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.39047236718854544, "frac_reward_zero_std": 1.0, "grad_norm": 0.020588131801362618, "kl": 0.001922607421875, "learning_rate": 7.883329393149466e-07, "loss": 0.0001, "num_tokens": 5165207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.39073999732369863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02691562348834776, "kl": 0.0028533935546875, "learning_rate": 7.879762619919508e-07, "loss": 0.0001, "num_tokens": 5168431.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.39100762745885187, "frac_reward_zero_std": 1.0, "grad_norm": 0.02274886713909099, "kl": 0.0017852783203125, "learning_rate": 7.876193769961554e-07, "loss": 0.0001, "num_tokens": 5171969.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.3912752575940051, "frac_reward_zero_std": 0.5, "grad_norm": 1.0643870088234857, "kl": 0.00257110595703125, "learning_rate": 7.872622846390001e-07, "loss": 0.0283, "num_tokens": 5175281.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.3915428877291583, "frac_reward_zero_std": 1.0, "grad_norm": 0.04058256219426428, "kl": 0.0029296875, "learning_rate": 7.869049852321061e-07, "loss": 0.0001, "num_tokens": 5178450.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.39181051786431154, "frac_reward_zero_std": 1.0, "grad_norm": 0.012541083079726588, "kl": 0.00128936767578125, "learning_rate": 7.865474790872748e-07, "loss": 0.0001, "num_tokens": 5182701.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.3920781479994647, "frac_reward_zero_std": 1.0, "grad_norm": 0.01139035686229909, "kl": 0.0009441375732421875, "learning_rate": 7.861897665164878e-07, "loss": 0.0, "num_tokens": 5186193.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 287.25, "completions/mean_terminated_length": 287.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.39234577813461796, "frac_reward_zero_std": 1.0, "grad_norm": 0.024698941404327032, "kl": 0.002288818359375, "learning_rate": 7.858318478319077e-07, "loss": 0.0001, "num_tokens": 5189491.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.3926134082697712, "frac_reward_zero_std": 1.0, "grad_norm": 0.029642518185163832, "kl": 0.0031280517578125, "learning_rate": 7.854737233458764e-07, "loss": 0.0001, "num_tokens": 5192727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.3928810384049244, "frac_reward_zero_std": 1.0, "grad_norm": 0.009186411126091533, "kl": 0.000682830810546875, "learning_rate": 7.851153933709151e-07, "loss": 0.0, "num_tokens": 5196024.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 239.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.39314866854007763, "frac_reward_zero_std": 0.5, "grad_norm": 0.7015586371404344, "kl": 0.002899169921875, "learning_rate": 7.847568582197253e-07, "loss": 0.045, "num_tokens": 5198878.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 442.375, "completions/mean_terminated_length": 442.375, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.3934162986752308, "frac_reward_zero_std": 1.0, "grad_norm": 0.01612856621260483, "kl": 0.001789093017578125, "learning_rate": 7.843981182051864e-07, "loss": 0.0001, "num_tokens": 5203809.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.39368392881038405, "frac_reward_zero_std": 1.0, "grad_norm": 0.017559520177021563, "kl": 0.00104522705078125, "learning_rate": 7.840391736403573e-07, "loss": 0.0, "num_tokens": 5207208.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.3939515589455373, "frac_reward_zero_std": 0.5, "grad_norm": 1.0026276710719364, "kl": 0.001583099365234375, "learning_rate": 7.836800248384754e-07, "loss": -0.0338, "num_tokens": 5210934.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 403.5, "completions/mean_terminated_length": 403.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3942191890806905, "frac_reward_zero_std": 1.0, "grad_norm": 0.010361038688684586, "kl": 0.00109100341796875, "learning_rate": 7.833206721129559e-07, "loss": 0.0, "num_tokens": 5215134.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 324.125, "completions/mean_terminated_length": 324.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.3944868192158437, "frac_reward_zero_std": 1.0, "grad_norm": 0.013731254619845023, "kl": 0.001468658447265625, "learning_rate": 7.829611157773922e-07, "loss": 0.0001, "num_tokens": 5218743.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.3947544493509969, "frac_reward_zero_std": 1.0, "grad_norm": 0.01236271116379577, "kl": 0.0013904571533203125, "learning_rate": 7.826013561455557e-07, "loss": 0.0001, "num_tokens": 5222553.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 277.5, "completions/mean_terminated_length": 277.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.39502207948615015, "frac_reward_zero_std": 1.0, "grad_norm": 0.01949516837432246, "kl": 0.001529693603515625, "learning_rate": 7.822413935313945e-07, "loss": 0.0001, "num_tokens": 5225761.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.39528970962130333, "frac_reward_zero_std": 1.0, "grad_norm": 0.012730341023471235, "kl": 0.0006589889526367188, "learning_rate": 7.818812282490345e-07, "loss": 0.0, "num_tokens": 5228752.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.39555733975645657, "frac_reward_zero_std": 1.0, "grad_norm": 0.023953182947898975, "kl": 0.002109527587890625, "learning_rate": 7.815208606127781e-07, "loss": 0.0001, "num_tokens": 5231987.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.3958249698916098, "frac_reward_zero_std": 1.0, "grad_norm": 0.015564801686470762, "kl": 0.001247406005859375, "learning_rate": 7.811602909371042e-07, "loss": 0.0, "num_tokens": 5235876.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 352.75, "completions/mean_terminated_length": 352.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.396092600026763, "frac_reward_zero_std": 1.0, "grad_norm": 0.05946924473678608, "kl": 0.00276947021484375, "learning_rate": 7.807995195366687e-07, "loss": 0.0001, "num_tokens": 5239894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.39636023016191624, "frac_reward_zero_std": 1.0, "grad_norm": 0.014972273652032138, "kl": 0.00135040283203125, "learning_rate": 7.804385467263025e-07, "loss": 0.0001, "num_tokens": 5243214.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.3966278602970694, "frac_reward_zero_std": 0.5, "grad_norm": 0.9663784656240924, "kl": 0.0020008087158203125, "learning_rate": 7.800773728210132e-07, "loss": 0.0181, "num_tokens": 5247435.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.39689549043222266, "frac_reward_zero_std": 1.0, "grad_norm": 0.03237135067704367, "kl": 0.00234222412109375, "learning_rate": 7.797159981359831e-07, "loss": 0.0001, "num_tokens": 5251072.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3971631205673759, "frac_reward_zero_std": 1.0, "grad_norm": 0.024258401070817146, "kl": 0.00194549560546875, "learning_rate": 7.793544229865702e-07, "loss": 0.0001, "num_tokens": 5253602.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.3974307507025291, "frac_reward_zero_std": 1.0, "grad_norm": 0.016125839809822885, "kl": 0.001434326171875, "learning_rate": 7.789926476883077e-07, "loss": 0.0001, "num_tokens": 5256528.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.39769838083768233, "frac_reward_zero_std": 1.0, "grad_norm": 0.015926007371516265, "kl": 0.00229644775390625, "learning_rate": 7.786306725569025e-07, "loss": 0.0001, "num_tokens": 5260200.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 255.25, "completions/mean_terminated_length": 255.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3979660109728355, "frac_reward_zero_std": 1.0, "grad_norm": 0.028455234521229854, "kl": 0.001720428466796875, "learning_rate": 7.78268497908237e-07, "loss": 0.0001, "num_tokens": 5263170.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 273.5, "completions/mean_terminated_length": 273.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.39823364110798876, "frac_reward_zero_std": 1.0, "grad_norm": 0.025735567705558557, "kl": 0.001922607421875, "learning_rate": 7.779061240583669e-07, "loss": 0.0001, "num_tokens": 5266370.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.398501271243142, "frac_reward_zero_std": 1.0, "grad_norm": 0.023721134897524008, "kl": 0.002197265625, "learning_rate": 7.775435513235221e-07, "loss": 0.0001, "num_tokens": 5269725.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.3987689013782952, "frac_reward_zero_std": 1.0, "grad_norm": 0.015140229209140222, "kl": 0.000888824462890625, "learning_rate": 7.77180780020106e-07, "loss": 0.0, "num_tokens": 5272805.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 281.875, "completions/mean_terminated_length": 281.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.3990365315134484, "frac_reward_zero_std": 1.0, "grad_norm": 0.033942560325590815, "kl": 0.002048492431640625, "learning_rate": 7.768178104646952e-07, "loss": 0.0001, "num_tokens": 5276068.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3993041616486016, "frac_reward_zero_std": 1.0, "grad_norm": 0.01713800964271963, "kl": 0.0018310546875, "learning_rate": 7.764546429740394e-07, "loss": 0.0001, "num_tokens": 5279105.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.39957179178375485, "frac_reward_zero_std": 1.0, "grad_norm": 0.013259349438352004, "kl": 0.0011653900146484375, "learning_rate": 7.760912778650611e-07, "loss": 0.0, "num_tokens": 5282774.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.3998394219189081, "frac_reward_zero_std": 1.0, "grad_norm": 0.033123682889975525, "kl": 0.00170135498046875, "learning_rate": 7.757277154548551e-07, "loss": 0.0001, "num_tokens": 5286033.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 260.75, "completions/mean_terminated_length": 260.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4001070520540613, "frac_reward_zero_std": 1.0, "grad_norm": 0.055014558043744846, "kl": 0.00240325927734375, "learning_rate": 7.753639560606885e-07, "loss": 0.0001, "num_tokens": 5289123.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 261.875, "completions/mean_terminated_length": 261.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.4003746821892145, "frac_reward_zero_std": 1.0, "grad_norm": 0.03754144225998123, "kl": 0.00225830078125, "learning_rate": 7.75e-07, "loss": 0.0001, "num_tokens": 5292226.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 451.75, "completions/mean_terminated_length": 451.75, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.4006423123243677, "frac_reward_zero_std": 1.0, "grad_norm": 0.010703607398468895, "kl": 0.0013637542724609375, "learning_rate": 7.746358475904005e-07, "loss": 0.0001, "num_tokens": 5297072.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.40090994245952094, "frac_reward_zero_std": 0.5, "grad_norm": 0.8489871358211638, "kl": 0.0018463134765625, "learning_rate": 7.742714991496714e-07, "loss": 0.0276, "num_tokens": 5301170.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 415.125, "completions/mean_terminated_length": 415.125, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.4011775725946742, "frac_reward_zero_std": 0.5, "grad_norm": 0.6073400075153469, "kl": 0.001556396484375, "learning_rate": 7.739069549957663e-07, "loss": 0.0562, "num_tokens": 5305659.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.40144520272982737, "frac_reward_zero_std": 1.0, "grad_norm": 0.018868504703904578, "kl": 0.0012359619140625, "learning_rate": 7.735422154468086e-07, "loss": 0.0, "num_tokens": 5308655.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 351.25, "completions/mean_terminated_length": 351.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4017128328649806, "frac_reward_zero_std": 0.5, "grad_norm": 0.771471022664655, "kl": 0.0011444091796875, "learning_rate": 7.731772808210928e-07, "loss": 0.0794, "num_tokens": 5312469.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4019804630001338, "frac_reward_zero_std": 1.0, "grad_norm": 0.015279864245231096, "kl": 0.001209259033203125, "learning_rate": 7.728121514370832e-07, "loss": 0.0, "num_tokens": 5315891.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.40224809313528703, "frac_reward_zero_std": 0.5, "grad_norm": 1.284611300259796, "kl": 0.00154876708984375, "learning_rate": 7.724468276134142e-07, "loss": -0.0537, "num_tokens": 5319423.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 361.25, "completions/mean_terminated_length": 361.25, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.4025157232704403, "frac_reward_zero_std": 1.0, "grad_norm": 0.015332989667562358, "kl": 0.00164031982421875, "learning_rate": 7.720813096688903e-07, "loss": 0.0001, "num_tokens": 5323449.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.40278335340559346, "frac_reward_zero_std": 1.0, "grad_norm": 0.023455259296959307, "kl": 0.001491546630859375, "learning_rate": 7.717155979224847e-07, "loss": 0.0001, "num_tokens": 5326915.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4030509835407467, "frac_reward_zero_std": 1.0, "grad_norm": 0.03714425326352367, "kl": 0.001953125, "learning_rate": 7.713496926933404e-07, "loss": 0.0001, "num_tokens": 5330005.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 314.0, "completions/mean_terminated_length": 314.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.4033186136758999, "frac_reward_zero_std": 1.0, "grad_norm": 0.026198661108948224, "kl": 0.00255584716796875, "learning_rate": 7.709835943007688e-07, "loss": 0.0001, "num_tokens": 5333561.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.4035862438110531, "frac_reward_zero_std": 0.5, "grad_norm": 0.6955095769457073, "kl": 0.0014190673828125, "learning_rate": 7.706173030642497e-07, "loss": 0.0003, "num_tokens": 5337729.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.40385387394620637, "frac_reward_zero_std": 1.0, "grad_norm": 0.017935988516870376, "kl": 0.0017547607421875, "learning_rate": 7.702508193034319e-07, "loss": 0.0001, "num_tokens": 5341273.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 413.75, "completions/mean_terminated_length": 413.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.40412150408135955, "frac_reward_zero_std": 0.5, "grad_norm": 0.7391701533411602, "kl": 0.003692626953125, "learning_rate": 7.698841433381316e-07, "loss": 0.0437, "num_tokens": 5345523.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 394.375, "completions/mean_terminated_length": 394.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.4043891342165128, "frac_reward_zero_std": 1.0, "grad_norm": 0.019601705108651304, "kl": 0.001739501953125, "learning_rate": 7.695172754883331e-07, "loss": 0.0001, "num_tokens": 5349910.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.404656764351666, "frac_reward_zero_std": 1.0, "grad_norm": 0.01828582664543044, "kl": 0.001667022705078125, "learning_rate": 7.691502160741877e-07, "loss": 0.0001, "num_tokens": 5353766.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.4049243944868192, "frac_reward_zero_std": 1.0, "grad_norm": 0.021281334065240867, "kl": 0.00536346435546875, "learning_rate": 7.687829654160143e-07, "loss": 0.0002, "num_tokens": 5357532.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 315.75, "completions/mean_terminated_length": 315.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.40519202462197246, "frac_reward_zero_std": 1.0, "grad_norm": 0.022854823994707774, "kl": 0.0023193359375, "learning_rate": 7.684155238342986e-07, "loss": 0.0001, "num_tokens": 5361102.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.40545965475712564, "frac_reward_zero_std": 1.0, "grad_norm": 0.05718849095010574, "kl": 0.002101898193359375, "learning_rate": 7.680478916496926e-07, "loss": 0.0001, "num_tokens": 5363902.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4057272848922789, "frac_reward_zero_std": 1.0, "grad_norm": 0.02664892341037933, "kl": 0.00241851806640625, "learning_rate": 7.67680069183015e-07, "loss": 0.0001, "num_tokens": 5367697.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.40599491502743207, "frac_reward_zero_std": 1.0, "grad_norm": 0.053430379718487415, "kl": 0.0051727294921875, "learning_rate": 7.673120567552504e-07, "loss": 0.0002, "num_tokens": 5371982.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 323.125, "completions/mean_terminated_length": 323.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.4062625451625853, "frac_reward_zero_std": 1.0, "grad_norm": 0.04003501543598559, "kl": 0.00389862060546875, "learning_rate": 7.669438546875493e-07, "loss": 0.0002, "num_tokens": 5375591.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.40653017529773855, "frac_reward_zero_std": 1.0, "grad_norm": 0.015994947496438452, "kl": 0.00119781494140625, "learning_rate": 7.665754633012275e-07, "loss": 0.0, "num_tokens": 5379028.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 481.0, "completions/mean_terminated_length": 481.0, "completions/min_length": 376.0, "completions/min_terminated_length": 376.0, "epoch": 0.40679780543289173, "frac_reward_zero_std": 0.5, "grad_norm": 0.6789661235736879, "kl": 0.001735687255859375, "learning_rate": 7.662068829177661e-07, "loss": -0.0278, "num_tokens": 5383908.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.407065435568045, "frac_reward_zero_std": 1.0, "grad_norm": 0.03656892310124714, "kl": 0.00218963623046875, "learning_rate": 7.658381138588111e-07, "loss": 0.0001, "num_tokens": 5386818.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.40733306570319816, "frac_reward_zero_std": 0.5, "grad_norm": 0.8861378360312004, "kl": 0.008758544921875, "learning_rate": 7.654691564461733e-07, "loss": 0.0065, "num_tokens": 5390827.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.4076006958383514, "frac_reward_zero_std": 1.0, "grad_norm": 0.07637043013662771, "kl": 0.00439453125, "learning_rate": 7.651000110018277e-07, "loss": 0.0002, "num_tokens": 5394167.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 366.375, "completions/mean_terminated_length": 366.375, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.40786832597350464, "frac_reward_zero_std": 1.0, "grad_norm": 0.07967474836213677, "kl": 0.00284576416015625, "learning_rate": 7.647306778479135e-07, "loss": 0.0001, "num_tokens": 5398086.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 347.875, "completions/mean_terminated_length": 347.875, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.4081359561086578, "frac_reward_zero_std": 0.5, "grad_norm": 0.8121279597745539, "kl": 0.00168609619140625, "learning_rate": 7.643611573067335e-07, "loss": 0.047, "num_tokens": 5401933.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.40840358624381107, "frac_reward_zero_std": 1.0, "grad_norm": 0.03676999745112891, "kl": 0.003265380859375, "learning_rate": 7.63991449700754e-07, "loss": 0.0001, "num_tokens": 5405752.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 680.0, "completions/max_terminated_length": 680.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.40867121637896425, "frac_reward_zero_std": 1.0, "grad_norm": 0.038931699430821046, "kl": 0.0025634765625, "learning_rate": 7.636215553526053e-07, "loss": 0.0001, "num_tokens": 5409401.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 315.25, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4089388465141175, "frac_reward_zero_std": 0.5, "grad_norm": 2.163411389282609, "kl": 0.00283050537109375, "learning_rate": 7.632514745850794e-07, "loss": -0.0083, "num_tokens": 5412931.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.40920647664927073, "frac_reward_zero_std": 1.0, "grad_norm": 0.009865331678074307, "kl": 0.0006160736083984375, "learning_rate": 7.628812077211321e-07, "loss": 0.0, "num_tokens": 5416461.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.4094741067844239, "frac_reward_zero_std": 1.0, "grad_norm": 0.017713995827228105, "kl": 0.0013885498046875, "learning_rate": 7.625107550838812e-07, "loss": 0.0001, "num_tokens": 5419594.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.40974173691957716, "frac_reward_zero_std": 1.0, "grad_norm": 0.01236744985343632, "kl": 0.001373291015625, "learning_rate": 7.621401169966063e-07, "loss": 0.0001, "num_tokens": 5422922.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 620.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 487.0, "completions/mean_terminated_length": 487.0, "completions/min_length": 406.0, "completions/min_terminated_length": 406.0, "epoch": 0.41000936705473034, "frac_reward_zero_std": 0.5, "grad_norm": 0.8016415929698653, "kl": 0.00196075439453125, "learning_rate": 7.617692937827492e-07, "loss": -0.0031, "num_tokens": 5428422.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 346.75, "completions/mean_terminated_length": 346.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4102769971898836, "frac_reward_zero_std": 1.0, "grad_norm": 0.010506379639715136, "kl": 0.001071929931640625, "learning_rate": 7.613982857659134e-07, "loss": 0.0, "num_tokens": 5432304.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 390.125, "completions/mean_terminated_length": 390.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.4105446273250368, "frac_reward_zero_std": 0.5, "grad_norm": 0.5587886277360758, "kl": 0.0016326904296875, "learning_rate": 7.61027093269863e-07, "loss": -0.0278, "num_tokens": 5436525.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.41081225746019, "frac_reward_zero_std": 0.5, "grad_norm": 1.1076983180437714, "kl": 0.00237274169921875, "learning_rate": 7.606557166185237e-07, "loss": 0.0549, "num_tokens": 5439701.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.41107988759534325, "frac_reward_zero_std": 1.0, "grad_norm": 0.016194059282558338, "kl": 0.0013275146484375, "learning_rate": 7.602841561359821e-07, "loss": 0.0001, "num_tokens": 5443451.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 621.0, "completions/max_terminated_length": 621.0, "completions/mean_length": 454.625, "completions/mean_terminated_length": 454.625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.41134751773049644, "frac_reward_zero_std": 1.0, "grad_norm": 0.008889079347377277, "kl": 0.0007038116455078125, "learning_rate": 7.599124121464841e-07, "loss": 0.0, "num_tokens": 5448052.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 192.375, "completions/mean_terminated_length": 192.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4116151478656497, "frac_reward_zero_std": 1.0, "grad_norm": 0.1500765673515895, "kl": 0.00257110595703125, "learning_rate": 7.595404849744373e-07, "loss": 0.0001, "num_tokens": 5450471.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.41188277800080286, "frac_reward_zero_std": 0.5, "grad_norm": 1.2011196215214026, "kl": 0.0013027191162109375, "learning_rate": 7.591683749444077e-07, "loss": 0.0783, "num_tokens": 5453878.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.4121504081359561, "frac_reward_zero_std": 1.0, "grad_norm": 0.022536121702698135, "kl": 0.001644134521484375, "learning_rate": 7.58796082381122e-07, "loss": 0.0001, "num_tokens": 5457457.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 311.25, "completions/mean_terminated_length": 311.25, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.41241803827110934, "frac_reward_zero_std": 1.0, "grad_norm": 0.01037889168280721, "kl": 0.000728607177734375, "learning_rate": 7.584236076094653e-07, "loss": 0.0, "num_tokens": 5460903.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 406.25, "completions/mean_terminated_length": 406.25, "completions/min_length": 312.0, "completions/min_terminated_length": 312.0, "epoch": 0.41268566840626253, "frac_reward_zero_std": 1.0, "grad_norm": 0.01772416877806647, "kl": 0.001651763916015625, "learning_rate": 7.580509509544823e-07, "loss": 0.0001, "num_tokens": 5465685.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.41295329854141577, "frac_reward_zero_std": 0.5, "grad_norm": 0.6685687250239778, "kl": 0.00106048583984375, "learning_rate": 7.576781127413764e-07, "loss": -0.007, "num_tokens": 5468791.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 400.125, "completions/mean_terminated_length": 400.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.41322092867656895, "frac_reward_zero_std": 1.0, "grad_norm": 0.03480745083376007, "kl": 0.002887725830078125, "learning_rate": 7.57305093295509e-07, "loss": 0.0001, "num_tokens": 5473260.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 283.75, "completions/mean_terminated_length": 283.75, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.4134885588117222, "frac_reward_zero_std": 1.0, "grad_norm": 0.02016422946976677, "kl": 0.00188446044921875, "learning_rate": 7.569318929424001e-07, "loss": 0.0001, "num_tokens": 5476538.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.41375618894687544, "frac_reward_zero_std": 1.0, "grad_norm": 0.015915699781991106, "kl": 0.00153350830078125, "learning_rate": 7.565585120077273e-07, "loss": 0.0001, "num_tokens": 5480369.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4140238190820286, "frac_reward_zero_std": 1.0, "grad_norm": 0.020414531809352494, "kl": 0.00115203857421875, "learning_rate": 7.56184950817326e-07, "loss": 0.0, "num_tokens": 5483579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.41429144921718186, "frac_reward_zero_std": 1.0, "grad_norm": 0.034413957961286434, "kl": 0.002532958984375, "learning_rate": 7.558112096971888e-07, "loss": 0.0001, "num_tokens": 5486859.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.41455907935233505, "frac_reward_zero_std": 0.5, "grad_norm": 0.8177202958259912, "kl": 0.0042724609375, "learning_rate": 7.554372889734651e-07, "loss": -0.0258, "num_tokens": 5490344.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.4148267094874883, "frac_reward_zero_std": 0.5, "grad_norm": 1.2187412942768039, "kl": 0.0036163330078125, "learning_rate": 7.550631889724613e-07, "loss": 0.0061, "num_tokens": 5494580.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.41509433962264153, "frac_reward_zero_std": 0.5, "grad_norm": 0.5519349817062564, "kl": 0.0017242431640625, "learning_rate": 7.546889100206403e-07, "loss": 0.0128, "num_tokens": 5498910.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4153619697577947, "frac_reward_zero_std": 1.0, "grad_norm": 0.07254577272695824, "kl": 0.00331878662109375, "learning_rate": 7.543144524446214e-07, "loss": 0.0001, "num_tokens": 5502636.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.41562959989294795, "frac_reward_zero_std": 1.0, "grad_norm": 0.0698398925812627, "kl": 0.00384521484375, "learning_rate": 7.539398165711788e-07, "loss": 0.0002, "num_tokens": 5505720.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.41589723002810114, "frac_reward_zero_std": 0.5, "grad_norm": 0.719334561462435, "kl": 0.00243377685546875, "learning_rate": 7.535650027272431e-07, "loss": -0.0025, "num_tokens": 5509341.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.4161648601632544, "frac_reward_zero_std": 0.5, "grad_norm": 0.8392021712489304, "kl": 0.001522064208984375, "learning_rate": 7.531900112399002e-07, "loss": -0.0089, "num_tokens": 5513395.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 272.0, "completions/mean_terminated_length": 272.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4164324902984076, "frac_reward_zero_std": 1.0, "grad_norm": 0.03930147159969516, "kl": 0.002185821533203125, "learning_rate": 7.528148424363907e-07, "loss": 0.0001, "num_tokens": 5516619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4167001204335608, "frac_reward_zero_std": 0.5, "grad_norm": 0.7979517482300789, "kl": 0.001220703125, "learning_rate": 7.524394966441102e-07, "loss": 0.029, "num_tokens": 5520044.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 216.75, "completions/mean_terminated_length": 216.75, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.41696775056871405, "frac_reward_zero_std": 0.5, "grad_norm": 1.2754560510980557, "kl": 0.001255035400390625, "learning_rate": 7.520639741906087e-07, "loss": 0.0163, "num_tokens": 5522662.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.41723538070386723, "frac_reward_zero_std": 1.0, "grad_norm": 0.017762971908937105, "kl": 0.00164031982421875, "learning_rate": 7.516882754035903e-07, "loss": 0.0001, "num_tokens": 5525464.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 348.0, "completions/mean_terminated_length": 348.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.41750301083902047, "frac_reward_zero_std": 1.0, "grad_norm": 0.014415731964977413, "kl": 0.001163482666015625, "learning_rate": 7.513124006109128e-07, "loss": 0.0, "num_tokens": 5529252.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 296.875, "completions/mean_terminated_length": 296.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4177706409741737, "frac_reward_zero_std": 1.0, "grad_norm": 0.020977225436533065, "kl": 0.00139617919921875, "learning_rate": 7.509363501405878e-07, "loss": 0.0001, "num_tokens": 5532587.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 333.0, "completions/mean_terminated_length": 333.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4180382711093269, "frac_reward_zero_std": 1.0, "grad_norm": 0.020710399165425657, "kl": 0.00200653076171875, "learning_rate": 7.505601243207804e-07, "loss": 0.0001, "num_tokens": 5536519.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.41830590124448014, "frac_reward_zero_std": 1.0, "grad_norm": 0.02956381872899783, "kl": 0.00209808349609375, "learning_rate": 7.501837234798083e-07, "loss": 0.0001, "num_tokens": 5539848.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 420.625, "completions/mean_terminated_length": 334.4285888671875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.4185735313796333, "frac_reward_zero_std": 0.5, "grad_norm": 0.5964628845318912, "kl": 0.001201629638671875, "learning_rate": 7.498071479461425e-07, "loss": 0.1494, "num_tokens": 5544329.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.41884116151478656, "frac_reward_zero_std": 0.5, "grad_norm": 0.8673328992414503, "kl": 0.00131988525390625, "learning_rate": 7.494303980484056e-07, "loss": 0.0591, "num_tokens": 5547958.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4191087916499398, "frac_reward_zero_std": 1.0, "grad_norm": 0.011396646690163968, "kl": 0.001277923583984375, "learning_rate": 7.490534741153732e-07, "loss": 0.0001, "num_tokens": 5551786.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.419376421785093, "frac_reward_zero_std": 1.0, "grad_norm": 0.02058379851255583, "kl": 0.0013885498046875, "learning_rate": 7.486763764759722e-07, "loss": 0.0001, "num_tokens": 5554779.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 359.875, "completions/mean_terminated_length": 359.875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.41964405192024623, "frac_reward_zero_std": 1.0, "grad_norm": 0.016599811192470948, "kl": 0.00140380859375, "learning_rate": 7.482991054592814e-07, "loss": 0.0001, "num_tokens": 5559006.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 386.625, "completions/mean_terminated_length": 386.625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.4199116820553994, "frac_reward_zero_std": 1.0, "grad_norm": 0.009905382693344011, "kl": 0.000934600830078125, "learning_rate": 7.47921661394531e-07, "loss": 0.0, "num_tokens": 5563283.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.42017931219055266, "frac_reward_zero_std": 1.0, "grad_norm": 0.021140641670120298, "kl": 0.00229644775390625, "learning_rate": 7.47544044611102e-07, "loss": 0.0001, "num_tokens": 5566511.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4204469423257059, "frac_reward_zero_std": 1.0, "grad_norm": 0.025001015398311042, "kl": 0.001430511474609375, "learning_rate": 7.471662554385258e-07, "loss": 0.0001, "num_tokens": 5569970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.4207145724608591, "frac_reward_zero_std": 1.0, "grad_norm": 0.017849896994287254, "kl": 0.00179290771484375, "learning_rate": 7.467882942064849e-07, "loss": 0.0001, "num_tokens": 5573027.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.4209822025960123, "frac_reward_zero_std": 1.0, "grad_norm": 0.01783244488194405, "kl": 0.001728057861328125, "learning_rate": 7.464101612448116e-07, "loss": 0.0001, "num_tokens": 5576280.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 227.375, "completions/mean_terminated_length": 227.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4212498327311655, "frac_reward_zero_std": 1.0, "grad_norm": 0.030261184697419046, "kl": 0.00237274169921875, "learning_rate": 7.460318568834881e-07, "loss": 0.0001, "num_tokens": 5578971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 264.5, "completions/mean_terminated_length": 264.5, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.42151746286631875, "frac_reward_zero_std": 1.0, "grad_norm": 0.020044631977457327, "kl": 0.000942230224609375, "learning_rate": 7.456533814526459e-07, "loss": 0.0, "num_tokens": 5582131.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 328.875, "completions/mean_terminated_length": 328.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.421785093001472, "frac_reward_zero_std": 1.0, "grad_norm": 0.033788112553139786, "kl": 0.00341796875, "learning_rate": 7.452747352825666e-07, "loss": 0.0001, "num_tokens": 5585850.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.4220527231366252, "frac_reward_zero_std": 1.0, "grad_norm": 0.018166224433360266, "kl": 0.00122833251953125, "learning_rate": 7.448959187036798e-07, "loss": 0.0, "num_tokens": 5589247.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 357.375, "completions/mean_terminated_length": 357.375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.4223203532717784, "frac_reward_zero_std": 1.0, "grad_norm": 0.012338188184343186, "kl": 0.001697540283203125, "learning_rate": 7.445169320465644e-07, "loss": 0.0001, "num_tokens": 5593386.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4225879834069316, "frac_reward_zero_std": 1.0, "grad_norm": 0.020464466828568978, "kl": 0.0017852783203125, "learning_rate": 7.441377756419476e-07, "loss": 0.0001, "num_tokens": 5596290.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.42285561354208484, "frac_reward_zero_std": 1.0, "grad_norm": 0.01289961778490145, "kl": 0.001110076904296875, "learning_rate": 7.437584498207048e-07, "loss": 0.0, "num_tokens": 5600325.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4231232436772381, "frac_reward_zero_std": 1.0, "grad_norm": 0.022603504937323677, "kl": 0.002597808837890625, "learning_rate": 7.433789549138591e-07, "loss": 0.0001, "num_tokens": 5603515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 392.75, "completions/mean_terminated_length": 392.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.42339087381239127, "frac_reward_zero_std": 0.5, "grad_norm": 0.9853927736329102, "kl": 0.001781463623046875, "learning_rate": 7.429992912525813e-07, "loss": 0.0001, "num_tokens": 5607997.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 224.375, "completions/mean_terminated_length": 224.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4236585039475445, "frac_reward_zero_std": 1.0, "grad_norm": 0.014675498319963428, "kl": 0.001041412353515625, "learning_rate": 7.426194591681893e-07, "loss": 0.0, "num_tokens": 5610764.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.4239261340826977, "frac_reward_zero_std": 1.0, "grad_norm": 0.01468557278141142, "kl": 0.001010894775390625, "learning_rate": 7.42239458992148e-07, "loss": 0.0, "num_tokens": 5614075.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 282.0, "completions/mean_terminated_length": 282.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.42419376421785093, "frac_reward_zero_std": 1.0, "grad_norm": 0.017311864565249756, "kl": 0.001445770263671875, "learning_rate": 7.418592910560689e-07, "loss": 0.0001, "num_tokens": 5617439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 188.0, "completions/mean_terminated_length": 188.0, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.4244613943530042, "frac_reward_zero_std": 1.0, "grad_norm": 0.010780652982669359, "kl": 0.0008087158203125, "learning_rate": 7.414789556917104e-07, "loss": 0.0, "num_tokens": 5619835.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.42472902448815736, "frac_reward_zero_std": 1.0, "grad_norm": 0.07168329216992467, "kl": 0.0040130615234375, "learning_rate": 7.410984532309767e-07, "loss": 0.0002, "num_tokens": 5622557.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 239.25, "completions/mean_terminated_length": 239.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4249966546233106, "frac_reward_zero_std": 1.0, "grad_norm": 0.025801552766414507, "kl": 0.001857757568359375, "learning_rate": 7.407177840059174e-07, "loss": 0.0001, "num_tokens": 5625511.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 335.25, "completions/mean_terminated_length": 335.25, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.4252642847584638, "frac_reward_zero_std": 1.0, "grad_norm": 0.013445201014243256, "kl": 0.001708984375, "learning_rate": 7.403369483487282e-07, "loss": 0.0001, "num_tokens": 5629313.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.425531914893617, "frac_reward_zero_std": 0.5, "grad_norm": 0.9643699536013896, "kl": 0.0031280517578125, "learning_rate": 7.399559465917499e-07, "loss": -0.1009, "num_tokens": 5632633.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 254.25, "completions/mean_terminated_length": 254.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.42579954502877027, "frac_reward_zero_std": 1.0, "grad_norm": 0.02113437928498294, "kl": 0.0017242431640625, "learning_rate": 7.39574779067468e-07, "loss": 0.0001, "num_tokens": 5635671.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.42606717516392345, "frac_reward_zero_std": 1.0, "grad_norm": 0.018883888278919632, "kl": 0.00208282470703125, "learning_rate": 7.39193446108513e-07, "loss": 0.0001, "num_tokens": 5639240.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4263348052990767, "frac_reward_zero_std": 0.5, "grad_norm": 0.8125139179091195, "kl": 0.0016632080078125, "learning_rate": 7.388119480476596e-07, "loss": -0.0461, "num_tokens": 5642190.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.4266024354342299, "frac_reward_zero_std": 1.0, "grad_norm": 0.027482340676832633, "kl": 0.0025482177734375, "learning_rate": 7.384302852178268e-07, "loss": 0.0001, "num_tokens": 5644819.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4268700655693831, "frac_reward_zero_std": 1.0, "grad_norm": 0.015693188571864348, "kl": 0.00156402587890625, "learning_rate": 7.380484579520766e-07, "loss": 0.0001, "num_tokens": 5648565.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.42713769570453636, "frac_reward_zero_std": 1.0, "grad_norm": 0.022472403563429853, "kl": 0.0017852783203125, "learning_rate": 7.376664665836155e-07, "loss": 0.0001, "num_tokens": 5652336.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 738.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.42740532583968954, "frac_reward_zero_std": 1.0, "grad_norm": 0.020135644460702373, "kl": 0.00232696533203125, "learning_rate": 7.372843114457929e-07, "loss": 0.0001, "num_tokens": 5656627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 336.875, "completions/mean_terminated_length": 336.875, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.4276729559748428, "frac_reward_zero_std": 1.0, "grad_norm": 0.024196677101352836, "kl": 0.00148773193359375, "learning_rate": 7.369019928721008e-07, "loss": 0.0001, "num_tokens": 5660482.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.42794058610999597, "frac_reward_zero_std": 1.0, "grad_norm": 0.015332258399517816, "kl": 0.001270294189453125, "learning_rate": 7.365195111961741e-07, "loss": 0.0001, "num_tokens": 5663587.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4282082162451492, "frac_reward_zero_std": 1.0, "grad_norm": 0.16543658423620236, "kl": 0.0034637451171875, "learning_rate": 7.361368667517896e-07, "loss": 0.0001, "num_tokens": 5667053.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.42847584638030245, "frac_reward_zero_std": 1.0, "grad_norm": 0.06784953464893712, "kl": 0.00284576416015625, "learning_rate": 7.35754059872867e-07, "loss": 0.0001, "num_tokens": 5670416.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.42874347651545563, "frac_reward_zero_std": 1.0, "grad_norm": 0.023632950770115475, "kl": 0.00231170654296875, "learning_rate": 7.35371090893467e-07, "loss": 0.0001, "num_tokens": 5673601.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4290111066506089, "frac_reward_zero_std": 1.0, "grad_norm": 0.02321326283614933, "kl": 0.00217437744140625, "learning_rate": 7.349879601477923e-07, "loss": 0.0001, "num_tokens": 5677710.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.42927873678576206, "frac_reward_zero_std": 1.0, "grad_norm": 0.027648331856927635, "kl": 0.0023956298828125, "learning_rate": 7.34604667970186e-07, "loss": 0.0001, "num_tokens": 5681554.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.4295463669209153, "frac_reward_zero_std": 1.0, "grad_norm": 0.012406831594406764, "kl": 0.001255035400390625, "learning_rate": 7.342212146951329e-07, "loss": 0.0001, "num_tokens": 5684961.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 332.125, "completions/mean_terminated_length": 332.125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.4298139970560685, "frac_reward_zero_std": 1.0, "grad_norm": 0.04545351078133834, "kl": 0.00258636474609375, "learning_rate": 7.338376006572579e-07, "loss": 0.0001, "num_tokens": 5688694.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 411.75, "completions/mean_terminated_length": 411.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.4300816271912217, "frac_reward_zero_std": 1.0, "grad_norm": 0.012171834303515264, "kl": 0.0013561248779296875, "learning_rate": 7.334538261913264e-07, "loss": 0.0001, "num_tokens": 5693236.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.43034925732637497, "frac_reward_zero_std": 1.0, "grad_norm": 0.016217829782340477, "kl": 0.001373291015625, "learning_rate": 7.330698916322436e-07, "loss": 0.0001, "num_tokens": 5696016.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 302.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.43061688746152815, "frac_reward_zero_std": 0.5, "grad_norm": 1.9137451490361894, "kl": 0.0037994384765625, "learning_rate": 7.326857973150548e-07, "loss": 0.014, "num_tokens": 5699413.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4308845175966814, "frac_reward_zero_std": 1.0, "grad_norm": 0.017369064027951974, "kl": 0.001148223876953125, "learning_rate": 7.323015435749441e-07, "loss": 0.0, "num_tokens": 5702207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.4311521477318346, "frac_reward_zero_std": 1.0, "grad_norm": 0.013351634138917743, "kl": 0.001377105712890625, "learning_rate": 7.319171307472353e-07, "loss": 0.0001, "num_tokens": 5705555.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 189.375, "completions/mean_terminated_length": 189.375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.4314197778669878, "frac_reward_zero_std": 1.0, "grad_norm": 0.01731731356737834, "kl": 0.0011138916015625, "learning_rate": 7.315325591673907e-07, "loss": 0.0, "num_tokens": 5707978.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.43168740800214106, "frac_reward_zero_std": 1.0, "grad_norm": 0.025853261712983574, "kl": 0.0018768310546875, "learning_rate": 7.311478291710112e-07, "loss": 0.0001, "num_tokens": 5711169.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.43195503813729424, "frac_reward_zero_std": 1.0, "grad_norm": 0.017538657930295617, "kl": 0.001163482666015625, "learning_rate": 7.307629410938363e-07, "loss": 0.0, "num_tokens": 5714337.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4322226682724475, "frac_reward_zero_std": 1.0, "grad_norm": 0.024958021379928027, "kl": 0.00286102294921875, "learning_rate": 7.303778952717428e-07, "loss": 0.0001, "num_tokens": 5717979.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.43249029840760067, "frac_reward_zero_std": 1.0, "grad_norm": 0.05320312649122515, "kl": 0.00391387939453125, "learning_rate": 7.299926920407454e-07, "loss": 0.0002, "num_tokens": 5720862.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4327579285427539, "frac_reward_zero_std": 1.0, "grad_norm": 0.01332181031494312, "kl": 0.001277923583984375, "learning_rate": 7.296073317369966e-07, "loss": 0.0001, "num_tokens": 5724584.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.43302555867790715, "frac_reward_zero_std": 1.0, "grad_norm": 0.011273376901737743, "kl": 0.000820159912109375, "learning_rate": 7.292218146967855e-07, "loss": 0.0, "num_tokens": 5728350.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 317.625, "completions/mean_terminated_length": 317.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.43329318881306034, "frac_reward_zero_std": 1.0, "grad_norm": 0.023695948406780028, "kl": 0.00218963623046875, "learning_rate": 7.288361412565379e-07, "loss": 0.0001, "num_tokens": 5731939.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 420.75, "completions/mean_terminated_length": 334.5714416503906, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.4335608189482136, "frac_reward_zero_std": 0.5, "grad_norm": 0.8881058825565062, "kl": 0.001445770263671875, "learning_rate": 7.284503117528166e-07, "loss": 0.1271, "num_tokens": 5736305.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 387.5, "completions/mean_terminated_length": 387.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.43382844908336676, "frac_reward_zero_std": 1.0, "grad_norm": 0.01686451600456983, "kl": 0.00201416015625, "learning_rate": 7.280643265223202e-07, "loss": 0.0001, "num_tokens": 5740493.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 235.625, "completions/mean_terminated_length": 235.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.43409607921852, "frac_reward_zero_std": 1.0, "grad_norm": 0.01235501412078311, "kl": 0.0006866455078125, "learning_rate": 7.276781859018831e-07, "loss": 0.0, "num_tokens": 5743362.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 348.375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.43436370935367324, "frac_reward_zero_std": 1.0, "grad_norm": 0.02655941582841897, "kl": 0.0011749267578125, "learning_rate": 7.272918902284757e-07, "loss": 0.0, "num_tokens": 5747237.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.43463133948882643, "frac_reward_zero_std": 1.0, "grad_norm": 0.014098123558651001, "kl": 0.001068115234375, "learning_rate": 7.269054398392033e-07, "loss": 0.0, "num_tokens": 5750611.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 477.25, "completions/mean_terminated_length": 477.25, "completions/min_length": 385.0, "completions/min_terminated_length": 385.0, "epoch": 0.43489896962397967, "frac_reward_zero_std": 0.5, "grad_norm": 0.6419811457429401, "kl": 0.001117706298828125, "learning_rate": 7.265188350713065e-07, "loss": -0.0108, "num_tokens": 5755517.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.43516659975913285, "frac_reward_zero_std": 0.5, "grad_norm": 0.7891896663537187, "kl": 0.00139617919921875, "learning_rate": 7.261320762621604e-07, "loss": -0.0107, "num_tokens": 5758709.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 248.5, "completions/mean_terminated_length": 248.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.4354342298942861, "frac_reward_zero_std": 1.0, "grad_norm": 0.03010861242101872, "kl": 0.001682281494140625, "learning_rate": 7.25745163749275e-07, "loss": 0.0001, "num_tokens": 5761561.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 265.75, "completions/mean_terminated_length": 265.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.43570186002943934, "frac_reward_zero_std": 1.0, "grad_norm": 0.04166438022153356, "kl": 0.00445556640625, "learning_rate": 7.253580978702938e-07, "loss": 0.0002, "num_tokens": 5764883.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 413.375, "completions/mean_terminated_length": 413.375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.4359694901645925, "frac_reward_zero_std": 1.0, "grad_norm": 0.015033118066834402, "kl": 0.001750946044921875, "learning_rate": 7.249708789629944e-07, "loss": 0.0001, "num_tokens": 5769158.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 261.125, "completions/mean_terminated_length": 261.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.43623712029974576, "frac_reward_zero_std": 1.0, "grad_norm": 0.01802902495786702, "kl": 0.00164794921875, "learning_rate": 7.245835073652878e-07, "loss": 0.0001, "num_tokens": 5772319.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.43650475043489895, "frac_reward_zero_std": 1.0, "grad_norm": 0.017859712628046844, "kl": 0.0021209716796875, "learning_rate": 7.24195983415219e-07, "loss": 0.0001, "num_tokens": 5776184.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.4367723805700522, "frac_reward_zero_std": 1.0, "grad_norm": 0.0202489313516729, "kl": 0.002117156982421875, "learning_rate": 7.238083074509647e-07, "loss": 0.0001, "num_tokens": 5779221.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 244.25, "completions/mean_terminated_length": 244.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.43704001070520543, "frac_reward_zero_std": 1.0, "grad_norm": 0.06904720407344278, "kl": 0.00571441650390625, "learning_rate": 7.234204798108352e-07, "loss": 0.0002, "num_tokens": 5782199.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 448.625, "completions/mean_terminated_length": 366.4285888671875, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.4373076408403586, "frac_reward_zero_std": 0.0, "grad_norm": 0.9146854905261514, "kl": 0.0032806396484375, "learning_rate": 7.230325008332731e-07, "loss": 0.2158, "num_tokens": 5786968.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 297.875, "completions/mean_terminated_length": 297.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.43757527097551185, "frac_reward_zero_std": 0.5, "grad_norm": 0.6798569950309266, "kl": 0.00237274169921875, "learning_rate": 7.226443708568524e-07, "loss": -0.0549, "num_tokens": 5790539.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.43784290111066504, "frac_reward_zero_std": 0.5, "grad_norm": 0.7823522051968057, "kl": 0.00264739990234375, "learning_rate": 7.222560902202795e-07, "loss": 0.0367, "num_tokens": 5795043.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.4381105312458183, "frac_reward_zero_std": 0.5, "grad_norm": 0.9367528990332736, "kl": 0.0033111572265625, "learning_rate": 7.218676592623923e-07, "loss": -0.0114, "num_tokens": 5799387.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4383781613809715, "frac_reward_zero_std": 1.0, "grad_norm": 0.046531734938952136, "kl": 0.00396728515625, "learning_rate": 7.214790783221595e-07, "loss": 0.0002, "num_tokens": 5802534.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.4386457915161247, "frac_reward_zero_std": 1.0, "grad_norm": 0.021061280396621675, "kl": 0.00266265869140625, "learning_rate": 7.210903477386807e-07, "loss": 0.0001, "num_tokens": 5806297.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.43891342165127795, "frac_reward_zero_std": 1.0, "grad_norm": 0.029710015631960688, "kl": 0.002391815185546875, "learning_rate": 7.207014678511867e-07, "loss": 0.0001, "num_tokens": 5810067.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.43918105178643113, "frac_reward_zero_std": 1.0, "grad_norm": 0.14100334906419806, "kl": 0.006805419921875, "learning_rate": 7.203124389990375e-07, "loss": 0.0003, "num_tokens": 5814702.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 413.25, "completions/mean_terminated_length": 413.25, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.43944868192158437, "frac_reward_zero_std": 1.0, "grad_norm": 0.05505880290298231, "kl": 0.006183624267578125, "learning_rate": 7.199232615217241e-07, "loss": 0.0002, "num_tokens": 5819616.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 241.0, "completions/mean_terminated_length": 241.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4397163120567376, "frac_reward_zero_std": 1.0, "grad_norm": 0.01847780279358232, "kl": 0.003753662109375, "learning_rate": 7.195339357588669e-07, "loss": 0.0002, "num_tokens": 5822468.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.4399839421918908, "frac_reward_zero_std": 1.0, "grad_norm": 0.010507689140162578, "kl": 0.00127410888671875, "learning_rate": 7.191444620502152e-07, "loss": 0.0001, "num_tokens": 5825989.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 353.75, "completions/mean_terminated_length": 353.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.44025157232704404, "frac_reward_zero_std": 0.5, "grad_norm": 0.9613628903443184, "kl": 0.0036468505859375, "learning_rate": 7.187548407356484e-07, "loss": 0.0344, "num_tokens": 5829803.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 402.875, "completions/mean_terminated_length": 402.875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.4405192024621972, "frac_reward_zero_std": 1.0, "grad_norm": 0.01991709858439099, "kl": 0.001903533935546875, "learning_rate": 7.183650721551739e-07, "loss": 0.0001, "num_tokens": 5834146.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 375.25, "completions/mean_terminated_length": 375.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.44078683259735046, "frac_reward_zero_std": 1.0, "grad_norm": 0.0135816363769179, "kl": 0.0011463165283203125, "learning_rate": 7.17975156648928e-07, "loss": 0.0, "num_tokens": 5838108.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4410544627325037, "frac_reward_zero_std": 1.0, "grad_norm": 0.04420795249546349, "kl": 0.00506591796875, "learning_rate": 7.175850945571746e-07, "loss": 0.0002, "num_tokens": 5841372.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 465.0, "completions/mean_terminated_length": 465.0, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.4413220928676569, "frac_reward_zero_std": 0.5, "grad_norm": 0.9973514601291164, "kl": 0.003936767578125, "learning_rate": 7.171948862203066e-07, "loss": -0.0146, "num_tokens": 5846332.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 388.875, "completions/mean_terminated_length": 388.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.44158972300281013, "frac_reward_zero_std": 0.5, "grad_norm": 0.6961027115201762, "kl": 0.006256103515625, "learning_rate": 7.168045319788435e-07, "loss": 0.0991, "num_tokens": 5850451.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4418573531379633, "frac_reward_zero_std": 1.0, "grad_norm": 0.012193530928667744, "kl": 0.001537322998046875, "learning_rate": 7.164140321734328e-07, "loss": 0.0001, "num_tokens": 5853992.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 250.75, "completions/mean_terminated_length": 250.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.44212498327311656, "frac_reward_zero_std": 1.0, "grad_norm": 0.04771920377016154, "kl": 0.00384521484375, "learning_rate": 7.160233871448487e-07, "loss": 0.0002, "num_tokens": 5856918.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.4423926134082698, "frac_reward_zero_std": 1.0, "grad_norm": 0.10635010764863577, "kl": 0.0055694580078125, "learning_rate": 7.15632597233992e-07, "loss": 0.0002, "num_tokens": 5860159.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.442660243543423, "frac_reward_zero_std": 1.0, "grad_norm": 0.02880758535873872, "kl": 0.0028228759765625, "learning_rate": 7.152416627818905e-07, "loss": 0.0001, "num_tokens": 5862798.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4429278736785762, "frac_reward_zero_std": 1.0, "grad_norm": 0.03505752259312084, "kl": 0.0024566650390625, "learning_rate": 7.148505841296974e-07, "loss": 0.0001, "num_tokens": 5866038.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 255.125, "completions/mean_terminated_length": 255.125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.4431955038137294, "frac_reward_zero_std": 1.0, "grad_norm": 0.041090098396111684, "kl": 0.00262451171875, "learning_rate": 7.144593616186923e-07, "loss": 0.0001, "num_tokens": 5869087.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 456.375, "completions/mean_terminated_length": 456.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.44346313394888265, "frac_reward_zero_std": 1.0, "grad_norm": 0.013534529713979539, "kl": 0.001590728759765625, "learning_rate": 7.140679955902803e-07, "loss": 0.0001, "num_tokens": 5873890.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.4437307640840359, "frac_reward_zero_std": 1.0, "grad_norm": 0.07769485134069738, "kl": 0.007110595703125, "learning_rate": 7.136764863859914e-07, "loss": 0.0003, "num_tokens": 5876791.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 449.25, "completions/mean_terminated_length": 449.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.4439983942191891, "frac_reward_zero_std": 0.5, "grad_norm": 0.7846810385610532, "kl": 0.002262115478515625, "learning_rate": 7.132848343474809e-07, "loss": 0.0143, "num_tokens": 5881793.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4442660243543423, "frac_reward_zero_std": 1.0, "grad_norm": 0.04803690116010133, "kl": 0.00446319580078125, "learning_rate": 7.128930398165283e-07, "loss": 0.0002, "num_tokens": 5885152.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.4445336544894955, "frac_reward_zero_std": 1.0, "grad_norm": 0.016724662509214522, "kl": 0.001495361328125, "learning_rate": 7.125011031350378e-07, "loss": 0.0001, "num_tokens": 5888193.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 498.0, "completions/mean_terminated_length": 498.0, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.44480128462464874, "frac_reward_zero_std": 1.0, "grad_norm": 0.022588492458443812, "kl": 0.00211334228515625, "learning_rate": 7.12109024645038e-07, "loss": 0.0001, "num_tokens": 5893381.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.445068914759802, "frac_reward_zero_std": 1.0, "grad_norm": 0.017169102165138367, "kl": 0.0019989013671875, "learning_rate": 7.117168046886802e-07, "loss": 0.0001, "num_tokens": 5896585.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 351.75, "completions/mean_terminated_length": 351.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.44533654489495517, "frac_reward_zero_std": 0.5, "grad_norm": 0.6972290301649886, "kl": 0.002002716064453125, "learning_rate": 7.113244436082404e-07, "loss": 0.0071, "num_tokens": 5900427.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4456041750301084, "frac_reward_zero_std": 1.0, "grad_norm": 0.035514758748706, "kl": 0.002349853515625, "learning_rate": 7.109319417461166e-07, "loss": 0.0001, "num_tokens": 5903604.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 406.0, "completions/mean_terminated_length": 406.0, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.4458718051652616, "frac_reward_zero_std": 1.0, "grad_norm": 0.018492215552945373, "kl": 0.002349853515625, "learning_rate": 7.105392994448305e-07, "loss": 0.0001, "num_tokens": 5908044.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.44613943530041483, "frac_reward_zero_std": 0.5, "grad_norm": 1.3092358233908612, "kl": 0.004180908203125, "learning_rate": 7.101465170470258e-07, "loss": 0.0398, "num_tokens": 5911721.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 359.5, "completions/mean_terminated_length": 359.5, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.446407065435568, "frac_reward_zero_std": 0.5, "grad_norm": 1.1860231668924146, "kl": 0.00213623046875, "learning_rate": 7.09753594895469e-07, "loss": 0.0358, "num_tokens": 5915545.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.44667469557072126, "frac_reward_zero_std": 1.0, "grad_norm": 0.013135897970716518, "kl": 0.00110626220703125, "learning_rate": 7.093605333330484e-07, "loss": 0.0, "num_tokens": 5919721.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 265.875, "completions/mean_terminated_length": 265.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4469423257058745, "frac_reward_zero_std": 1.0, "grad_norm": 0.00855596839924567, "kl": 0.0006799697875976562, "learning_rate": 7.089673327027733e-07, "loss": 0.0, "num_tokens": 5922828.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.4472099558410277, "frac_reward_zero_std": 1.0, "grad_norm": 0.02386714946782478, "kl": 0.00342559814453125, "learning_rate": 7.085739933477751e-07, "loss": 0.0001, "num_tokens": 5925737.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.4474775859761809, "frac_reward_zero_std": 1.0, "grad_norm": 0.014209570078598103, "kl": 0.001384735107421875, "learning_rate": 7.081805156113062e-07, "loss": 0.0001, "num_tokens": 5928534.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.4477452161113341, "frac_reward_zero_std": 1.0, "grad_norm": 0.013589390333289871, "kl": 0.00141143798828125, "learning_rate": 7.077868998367394e-07, "loss": 0.0001, "num_tokens": 5931864.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.44801284624648735, "frac_reward_zero_std": 0.5, "grad_norm": 1.127932866435014, "kl": 0.002414703369140625, "learning_rate": 7.073931463675684e-07, "loss": -0.0047, "num_tokens": 5935585.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4482804763816406, "frac_reward_zero_std": 1.0, "grad_norm": 0.020832878445256046, "kl": 0.00193023681640625, "learning_rate": 7.069992555474066e-07, "loss": 0.0001, "num_tokens": 5939077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.4485481065167938, "frac_reward_zero_std": 1.0, "grad_norm": 0.016103591602264104, "kl": 0.0011138916015625, "learning_rate": 7.066052277199875e-07, "loss": 0.0, "num_tokens": 5942534.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 210.625, "completions/mean_terminated_length": 210.625, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.448815736651947, "frac_reward_zero_std": 1.0, "grad_norm": 0.010683403228326754, "kl": 0.000736236572265625, "learning_rate": 7.06211063229164e-07, "loss": 0.0, "num_tokens": 5945299.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.4490833667871002, "frac_reward_zero_std": 0.5, "grad_norm": 0.7521276839076597, "kl": 0.00234222412109375, "learning_rate": 7.058167624189088e-07, "loss": 0.0152, "num_tokens": 5949025.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.44935099692225344, "frac_reward_zero_std": 1.0, "grad_norm": 0.01364811876131785, "kl": 0.00110626220703125, "learning_rate": 7.054223256333129e-07, "loss": 0.0, "num_tokens": 5953580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.4496186270574067, "frac_reward_zero_std": 1.0, "grad_norm": 0.02096857090156322, "kl": 0.002765655517578125, "learning_rate": 7.050277532165864e-07, "loss": 0.0001, "num_tokens": 5956860.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 391.5, "completions/mean_terminated_length": 391.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.44988625719255987, "frac_reward_zero_std": 0.5, "grad_norm": 0.7222554081408509, "kl": 0.00171661376953125, "learning_rate": 7.046330455130576e-07, "loss": 0.0496, "num_tokens": 5961232.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 403.0, "completions/mean_terminated_length": 403.0, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.4501538873277131, "frac_reward_zero_std": 0.5, "grad_norm": 0.7026408197104999, "kl": 0.002132415771484375, "learning_rate": 7.042382028671725e-07, "loss": 0.0374, "num_tokens": 5965772.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 337.0, "completions/mean_terminated_length": 337.0, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4504215174628663, "frac_reward_zero_std": 1.0, "grad_norm": 0.009822173092889032, "kl": 0.00103759765625, "learning_rate": 7.038432256234955e-07, "loss": 0.0, "num_tokens": 5969424.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.45068914759801953, "frac_reward_zero_std": 1.0, "grad_norm": 0.013627361097772255, "kl": 0.00113677978515625, "learning_rate": 7.034481141267079e-07, "loss": 0.0, "num_tokens": 5972978.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 225.25, "completions/mean_terminated_length": 225.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.4509567777331728, "frac_reward_zero_std": 1.0, "grad_norm": 0.02276409264316328, "kl": 0.00244140625, "learning_rate": 7.030528687216088e-07, "loss": 0.0001, "num_tokens": 5975804.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.45122440786832596, "frac_reward_zero_std": 1.0, "grad_norm": 0.01689405336469268, "kl": 0.000919342041015625, "learning_rate": 7.026574897531135e-07, "loss": 0.0, "num_tokens": 5978899.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4514920380034792, "frac_reward_zero_std": 1.0, "grad_norm": 0.029239438876792742, "kl": 0.00179290771484375, "learning_rate": 7.022619775662544e-07, "loss": 0.0001, "num_tokens": 5981876.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4517596681386324, "frac_reward_zero_std": 1.0, "grad_norm": 0.07078064169218325, "kl": 0.00579833984375, "learning_rate": 7.018663325061796e-07, "loss": 0.0002, "num_tokens": 5985202.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4520272982737856, "frac_reward_zero_std": 1.0, "grad_norm": 0.07388757984341446, "kl": 0.0037078857421875, "learning_rate": 7.014705549181535e-07, "loss": 0.0001, "num_tokens": 5988784.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 293.875, "completions/mean_terminated_length": 293.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.45229492840893887, "frac_reward_zero_std": 0.5, "grad_norm": 0.7659966917298997, "kl": 0.00385284423828125, "learning_rate": 7.010746451475562e-07, "loss": -0.0004, "num_tokens": 5992355.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.45256255854409205, "frac_reward_zero_std": 0.5, "grad_norm": 0.9741417193062853, "kl": 0.001739501953125, "learning_rate": 7.006786035398829e-07, "loss": 0.018, "num_tokens": 5995586.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.4528301886792453, "frac_reward_zero_std": 1.0, "grad_norm": 0.017178487496683183, "kl": 0.001728057861328125, "learning_rate": 7.002824304407442e-07, "loss": 0.0001, "num_tokens": 5999710.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.4530978188143985, "frac_reward_zero_std": 0.5, "grad_norm": 1.1631316810558805, "kl": 0.00289154052734375, "learning_rate": 6.998861261958651e-07, "loss": -0.0195, "num_tokens": 6003478.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 256.75, "completions/mean_terminated_length": 256.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4533654489495517, "frac_reward_zero_std": 1.0, "grad_norm": 0.024019819663775626, "kl": 0.002227783203125, "learning_rate": 6.994896911510852e-07, "loss": 0.0001, "num_tokens": 6006624.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 384.375, "completions/mean_terminated_length": 384.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.45363307908470496, "frac_reward_zero_std": 1.0, "grad_norm": 0.025120985229794102, "kl": 0.00182342529296875, "learning_rate": 6.990931256523583e-07, "loss": 0.0001, "num_tokens": 6011119.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 418.25, "completions/mean_terminated_length": 418.25, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.45390070921985815, "frac_reward_zero_std": 1.0, "grad_norm": 0.013065085966787545, "kl": 0.00150299072265625, "learning_rate": 6.986964300457517e-07, "loss": 0.0001, "num_tokens": 6015557.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4541683393550114, "frac_reward_zero_std": 1.0, "grad_norm": 0.031193434211914613, "kl": 0.00223541259765625, "learning_rate": 6.982996046774468e-07, "loss": 0.0001, "num_tokens": 6018482.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.45443596949016457, "frac_reward_zero_std": 1.0, "grad_norm": 0.06262178146519036, "kl": 0.003772735595703125, "learning_rate": 6.979026498937379e-07, "loss": 0.0002, "num_tokens": 6021536.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 391.625, "completions/mean_terminated_length": 391.625, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.4547035996253178, "frac_reward_zero_std": 1.0, "grad_norm": 0.008917060097067277, "kl": 0.0005950927734375, "learning_rate": 6.975055660410324e-07, "loss": 0.0, "num_tokens": 6025769.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.45497122976047105, "frac_reward_zero_std": 1.0, "grad_norm": 0.013230556779347112, "kl": 0.00075531005859375, "learning_rate": 6.9710835346585e-07, "loss": 0.0, "num_tokens": 6028471.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.45523885989562424, "frac_reward_zero_std": 1.0, "grad_norm": 0.05846146371285195, "kl": 0.00600433349609375, "learning_rate": 6.967110125148229e-07, "loss": 0.0002, "num_tokens": 6032556.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.4555064900307775, "frac_reward_zero_std": 1.0, "grad_norm": 0.03477776630328761, "kl": 0.00312042236328125, "learning_rate": 6.963135435346957e-07, "loss": 0.0001, "num_tokens": 6035611.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 319.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.45577412016593066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0290507346188508, "kl": 0.003875732421875, "learning_rate": 6.959159468723241e-07, "loss": 0.0002, "num_tokens": 6039294.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.4560417503010839, "frac_reward_zero_std": 1.0, "grad_norm": 0.12979665109011596, "kl": 0.00677490234375, "learning_rate": 6.955182228746756e-07, "loss": 0.0003, "num_tokens": 6042290.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 266.75, "completions/mean_terminated_length": 266.75, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.45630938043623714, "frac_reward_zero_std": 0.5, "grad_norm": 1.2945310500215907, "kl": 0.00254058837890625, "learning_rate": 6.95120371888829e-07, "loss": 0.0376, "num_tokens": 6045408.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.45657701057139033, "frac_reward_zero_std": 1.0, "grad_norm": 0.0780661215062084, "kl": 0.0057773590087890625, "learning_rate": 6.947223942619733e-07, "loss": 0.0002, "num_tokens": 6048233.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.45684464070654357, "frac_reward_zero_std": 1.0, "grad_norm": 0.027029772571437374, "kl": 0.0033111572265625, "learning_rate": 6.943242903414086e-07, "loss": 0.0001, "num_tokens": 6051715.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.45711227084169676, "frac_reward_zero_std": 1.0, "grad_norm": 0.1165522572554516, "kl": 0.009521484375, "learning_rate": 6.939260604745449e-07, "loss": 0.0004, "num_tokens": 6054798.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 240.0, "completions/max_terminated_length": 240.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.45737990097685, "frac_reward_zero_std": 1.0, "grad_norm": 0.012548749274167052, "kl": 0.0006952285766601562, "learning_rate": 6.935277050089025e-07, "loss": 0.0, "num_tokens": 6057521.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.45764753111200324, "frac_reward_zero_std": 1.0, "grad_norm": 0.080244935718362, "kl": 0.01031494140625, "learning_rate": 6.931292242921105e-07, "loss": 0.0004, "num_tokens": 6060790.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 281.375, "completions/mean_terminated_length": 281.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.4579151612471564, "frac_reward_zero_std": 1.0, "grad_norm": 0.03810518191880963, "kl": 0.00324249267578125, "learning_rate": 6.927306186719083e-07, "loss": 0.0001, "num_tokens": 6064029.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.45818279138230966, "frac_reward_zero_std": 1.0, "grad_norm": 0.01806377529710033, "kl": 0.00202178955078125, "learning_rate": 6.923318884961433e-07, "loss": 0.0001, "num_tokens": 6067126.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.45845042151746285, "frac_reward_zero_std": 1.0, "grad_norm": 0.11567624298426513, "kl": 0.005157470703125, "learning_rate": 6.919330341127726e-07, "loss": 0.0002, "num_tokens": 6070024.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.4587180516526161, "frac_reward_zero_std": 1.0, "grad_norm": 0.010308721004141406, "kl": 0.00095367431640625, "learning_rate": 6.915340558698608e-07, "loss": 0.0, "num_tokens": 6073081.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.45898568178776933, "frac_reward_zero_std": 1.0, "grad_norm": 0.05718839193658793, "kl": 0.0111236572265625, "learning_rate": 6.911349541155811e-07, "loss": 0.0004, "num_tokens": 6076346.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 326.75, "completions/mean_terminated_length": 326.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.4592533119229225, "frac_reward_zero_std": 1.0, "grad_norm": 0.033467854366997556, "kl": 0.00313568115234375, "learning_rate": 6.907357291982146e-07, "loss": 0.0001, "num_tokens": 6080280.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 339.5, "completions/mean_terminated_length": 339.5, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.45952094205807575, "frac_reward_zero_std": 1.0, "grad_norm": 0.018335191494930195, "kl": 0.001678466796875, "learning_rate": 6.903363814661496e-07, "loss": 0.0001, "num_tokens": 6083972.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.45978857219322894, "frac_reward_zero_std": 1.0, "grad_norm": 0.05569082237856345, "kl": 0.0059814453125, "learning_rate": 6.899369112678814e-07, "loss": 0.0002, "num_tokens": 6086936.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4600562023283822, "frac_reward_zero_std": 1.0, "grad_norm": 0.06714649369086342, "kl": 0.0061492919921875, "learning_rate": 6.895373189520123e-07, "loss": 0.0002, "num_tokens": 6090166.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4603238324635354, "frac_reward_zero_std": 1.0, "grad_norm": 0.02251504167203449, "kl": 0.0022125244140625, "learning_rate": 6.891376048672516e-07, "loss": 0.0001, "num_tokens": 6093287.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 275.875, "completions/mean_terminated_length": 275.875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.4605914625986886, "frac_reward_zero_std": 1.0, "grad_norm": 0.015918724912198697, "kl": 0.001323699951171875, "learning_rate": 6.887377693624143e-07, "loss": 0.0001, "num_tokens": 6096446.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.46085909273384185, "frac_reward_zero_std": 1.0, "grad_norm": 0.023637613588293926, "kl": 0.0015411376953125, "learning_rate": 6.883378127864218e-07, "loss": 0.0001, "num_tokens": 6099826.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 313.75, "completions/mean_terminated_length": 313.75, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.46112672286899503, "frac_reward_zero_std": 1.0, "grad_norm": 0.026180425475658457, "kl": 0.00238037109375, "learning_rate": 6.879377354883008e-07, "loss": 0.0001, "num_tokens": 6103376.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 298.375, "completions/mean_terminated_length": 298.375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.46139435300414827, "frac_reward_zero_std": 1.0, "grad_norm": 0.035854430950948785, "kl": 0.00330352783203125, "learning_rate": 6.875375378171832e-07, "loss": 0.0001, "num_tokens": 6106783.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4616619831393015, "frac_reward_zero_std": 1.0, "grad_norm": 0.03992368227660792, "kl": 0.0057525634765625, "learning_rate": 6.871372201223067e-07, "loss": 0.0002, "num_tokens": 6109968.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.4619296132744547, "frac_reward_zero_std": 0.5, "grad_norm": 0.5893389147268271, "kl": 0.00237274169921875, "learning_rate": 6.867367827530131e-07, "loss": 0.0862, "num_tokens": 6113187.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 184.0, "completions/max_terminated_length": 184.0, "completions/mean_length": 161.625, "completions/mean_terminated_length": 161.625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.46219724340960794, "frac_reward_zero_std": 1.0, "grad_norm": 0.01248203712585921, "kl": 0.000606536865234375, "learning_rate": 6.863362260587486e-07, "loss": 0.0, "num_tokens": 6115384.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4624648735447611, "frac_reward_zero_std": 1.0, "grad_norm": 0.05991098872488089, "kl": 0.0030975341796875, "learning_rate": 6.859355503890642e-07, "loss": 0.0001, "num_tokens": 6118427.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 249.25, "completions/mean_terminated_length": 249.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.46273250367991436, "frac_reward_zero_std": 1.0, "grad_norm": 0.07218384542060849, "kl": 0.0055694580078125, "learning_rate": 6.855347560936139e-07, "loss": 0.0002, "num_tokens": 6121345.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.46300013381506755, "frac_reward_zero_std": 1.0, "grad_norm": 0.020469002186876223, "kl": 0.00208282470703125, "learning_rate": 6.851338435221555e-07, "loss": 0.0001, "num_tokens": 6125296.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.4632677639502208, "frac_reward_zero_std": 1.0, "grad_norm": 0.01865539905482423, "kl": 0.001445770263671875, "learning_rate": 6.847328130245505e-07, "loss": 0.0001, "num_tokens": 6128236.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.46353539408537403, "frac_reward_zero_std": 1.0, "grad_norm": 0.012377507169034171, "kl": 0.000858306884765625, "learning_rate": 6.843316649507625e-07, "loss": 0.0, "num_tokens": 6131823.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.4638030242205272, "frac_reward_zero_std": 1.0, "grad_norm": 0.042962973895883154, "kl": 0.0030517578125, "learning_rate": 6.839303996508583e-07, "loss": 0.0001, "num_tokens": 6135025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 330.75, "completions/mean_terminated_length": 330.75, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.46407065435568046, "frac_reward_zero_std": 1.0, "grad_norm": 0.016294246658992876, "kl": 0.001556396484375, "learning_rate": 6.835290174750069e-07, "loss": 0.0001, "num_tokens": 6138783.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 347.875, "completions/mean_terminated_length": 347.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.46433828449083364, "frac_reward_zero_std": 1.0, "grad_norm": 0.061683492119676796, "kl": 0.0039215087890625, "learning_rate": 6.83127518773479e-07, "loss": 0.0002, "num_tokens": 6142798.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.4646059146259869, "frac_reward_zero_std": 1.0, "grad_norm": 0.015645644948197864, "kl": 0.0015125274658203125, "learning_rate": 6.827259038966473e-07, "loss": 0.0001, "num_tokens": 6145718.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.4648735447611401, "frac_reward_zero_std": 1.0, "grad_norm": 0.016325394962264792, "kl": 0.001720428466796875, "learning_rate": 6.823241731949859e-07, "loss": 0.0001, "num_tokens": 6150057.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.4651411748962933, "frac_reward_zero_std": 1.0, "grad_norm": 0.015693032653560566, "kl": 0.00183868408203125, "learning_rate": 6.819223270190694e-07, "loss": 0.0001, "num_tokens": 6153122.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.46540880503144655, "frac_reward_zero_std": 1.0, "grad_norm": 0.02778740822732079, "kl": 0.00362396240234375, "learning_rate": 6.815203657195742e-07, "loss": 0.0001, "num_tokens": 6156915.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 396.125, "completions/mean_terminated_length": 396.125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.46567643516659973, "frac_reward_zero_std": 1.0, "grad_norm": 0.016775558254881473, "kl": 0.00173187255859375, "learning_rate": 6.811182896472762e-07, "loss": 0.0001, "num_tokens": 6161160.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 560.0, "completions/max_terminated_length": 560.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.465944065301753, "frac_reward_zero_std": 0.5, "grad_norm": 0.8033474804507165, "kl": 0.00278472900390625, "learning_rate": 6.807160991530519e-07, "loss": -0.0075, "num_tokens": 6165293.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.4662116954369062, "frac_reward_zero_std": 1.0, "grad_norm": 0.034055935850790586, "kl": 0.00284576416015625, "learning_rate": 6.803137945878779e-07, "loss": 0.0001, "num_tokens": 6168996.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 338.875, "completions/mean_terminated_length": 338.875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.4664793255720594, "frac_reward_zero_std": 1.0, "grad_norm": 0.01605397494402964, "kl": 0.001583099365234375, "learning_rate": 6.799113763028294e-07, "loss": 0.0001, "num_tokens": 6172803.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.46674695570721264, "frac_reward_zero_std": 1.0, "grad_norm": 0.023198710951097438, "kl": 0.001483917236328125, "learning_rate": 6.79508844649082e-07, "loss": 0.0001, "num_tokens": 6176280.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.4670145858423658, "frac_reward_zero_std": 1.0, "grad_norm": 0.014304455780758316, "kl": 0.001079559326171875, "learning_rate": 6.791061999779095e-07, "loss": 0.0, "num_tokens": 6179240.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.46728221597751907, "frac_reward_zero_std": 1.0, "grad_norm": 0.01409770749577291, "kl": 0.0010547637939453125, "learning_rate": 6.787034426406849e-07, "loss": 0.0, "num_tokens": 6183048.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 224.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.4675498461126723, "frac_reward_zero_std": 1.0, "grad_norm": 0.020055475323712036, "kl": 0.001529693603515625, "learning_rate": 6.783005729888786e-07, "loss": 0.0001, "num_tokens": 6185889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4678174762478255, "frac_reward_zero_std": 1.0, "grad_norm": 0.014171398425757039, "kl": 0.0012359619140625, "learning_rate": 6.778975913740599e-07, "loss": 0.0, "num_tokens": 6188598.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 408.625, "completions/mean_terminated_length": 408.625, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.46808510638297873, "frac_reward_zero_std": 0.5, "grad_norm": 0.6707061990910149, "kl": 0.002227783203125, "learning_rate": 6.774944981478952e-07, "loss": 0.0229, "num_tokens": 6192979.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 775.0, "completions/max_terminated_length": 775.0, "completions/mean_length": 433.875, "completions/mean_terminated_length": 433.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.4683527365181319, "frac_reward_zero_std": 0.5, "grad_norm": 0.5064438672892013, "kl": 0.0012359619140625, "learning_rate": 6.770912936621489e-07, "loss": 0.0616, "num_tokens": 6197426.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.46862036665328516, "frac_reward_zero_std": 0.5, "grad_norm": 0.6584740428002547, "kl": 0.00244140625, "learning_rate": 6.766879782686821e-07, "loss": -0.0308, "num_tokens": 6200840.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 278.75, "completions/mean_terminated_length": 278.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.4688879967884384, "frac_reward_zero_std": 1.0, "grad_norm": 0.04147895999020293, "kl": 0.00323486328125, "learning_rate": 6.762845523194526e-07, "loss": 0.0001, "num_tokens": 6204122.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4691556269235916, "frac_reward_zero_std": 1.0, "grad_norm": 0.022381708962702983, "kl": 0.001377105712890625, "learning_rate": 6.758810161665147e-07, "loss": 0.0001, "num_tokens": 6207970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 335.625, "completions/mean_terminated_length": 335.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4694232570587448, "frac_reward_zero_std": 1.0, "grad_norm": 0.014430917459854885, "kl": 0.00115966796875, "learning_rate": 6.754773701620194e-07, "loss": 0.0, "num_tokens": 6211703.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.469690887193898, "frac_reward_zero_std": 1.0, "grad_norm": 0.06459107854464331, "kl": 0.00537109375, "learning_rate": 6.750736146582129e-07, "loss": 0.0002, "num_tokens": 6215006.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.46995851732905125, "frac_reward_zero_std": 1.0, "grad_norm": 0.022092834583049517, "kl": 0.00220489501953125, "learning_rate": 6.746697500074373e-07, "loss": 0.0001, "num_tokens": 6218177.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.4702261474642045, "frac_reward_zero_std": 0.0, "grad_norm": 1.3669159292370148, "kl": 0.0040435791015625, "learning_rate": 6.742657765621299e-07, "loss": 0.0426, "num_tokens": 6222497.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 233.25, "completions/mean_terminated_length": 233.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.4704937775993577, "frac_reward_zero_std": 1.0, "grad_norm": 0.015017598871453264, "kl": 0.00109100341796875, "learning_rate": 6.738616946748228e-07, "loss": 0.0, "num_tokens": 6225391.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.4707614077345109, "frac_reward_zero_std": 1.0, "grad_norm": 0.03031608338011012, "kl": 0.001964569091796875, "learning_rate": 6.734575046981429e-07, "loss": 0.0001, "num_tokens": 6229008.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 231.75, "completions/mean_terminated_length": 231.75, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.4710290378696641, "frac_reward_zero_std": 1.0, "grad_norm": 0.042545698313104605, "kl": 0.0027313232421875, "learning_rate": 6.730532069848113e-07, "loss": 0.0001, "num_tokens": 6232122.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.47129666800481734, "frac_reward_zero_std": 1.0, "grad_norm": 0.016858530833446784, "kl": 0.00154876708984375, "learning_rate": 6.72648801887643e-07, "loss": 0.0001, "num_tokens": 6234886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 287.875, "completions/mean_terminated_length": 287.875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4715642981399706, "frac_reward_zero_std": 1.0, "grad_norm": 0.012852712805459684, "kl": 0.0011444091796875, "learning_rate": 6.722442897595474e-07, "loss": 0.0, "num_tokens": 6238273.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.47183192827512377, "frac_reward_zero_std": 1.0, "grad_norm": 0.020564978786252887, "kl": 0.002063751220703125, "learning_rate": 6.718396709535265e-07, "loss": 0.0001, "num_tokens": 6242085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.472099558410277, "frac_reward_zero_std": 1.0, "grad_norm": 0.015898674530030354, "kl": 0.00128173828125, "learning_rate": 6.714349458226755e-07, "loss": 0.0001, "num_tokens": 6245583.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 237.5, "completions/mean_terminated_length": 237.5, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4723671885454302, "frac_reward_zero_std": 1.0, "grad_norm": 0.03020754741261687, "kl": 0.002040863037109375, "learning_rate": 6.710301147201829e-07, "loss": 0.0001, "num_tokens": 6248455.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 254.875, "completions/mean_terminated_length": 254.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.47263481868058344, "frac_reward_zero_std": 1.0, "grad_norm": 0.05652165903062416, "kl": 0.00177764892578125, "learning_rate": 6.706251779993292e-07, "loss": 0.0001, "num_tokens": 6251594.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 218.875, "completions/mean_terminated_length": 218.875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.4729024488157367, "frac_reward_zero_std": 1.0, "grad_norm": 0.015434624098680126, "kl": 0.0014896392822265625, "learning_rate": 6.702201360134872e-07, "loss": 0.0001, "num_tokens": 6254369.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 184.75, "completions/mean_terminated_length": 184.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.47317007895088986, "frac_reward_zero_std": 1.0, "grad_norm": 0.023960275207173986, "kl": 0.002227783203125, "learning_rate": 6.698149891161216e-07, "loss": 0.0001, "num_tokens": 6256707.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4734377090860431, "frac_reward_zero_std": 1.0, "grad_norm": 0.018336556985394666, "kl": 0.0018463134765625, "learning_rate": 6.694097376607887e-07, "loss": 0.0001, "num_tokens": 6259689.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.4737053392211963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0491168714190417, "kl": 0.002655029296875, "learning_rate": 6.690043820011361e-07, "loss": 0.0001, "num_tokens": 6262753.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.4739729693563495, "frac_reward_zero_std": 1.0, "grad_norm": 0.023237528051327214, "kl": 0.0026397705078125, "learning_rate": 6.685989224909017e-07, "loss": 0.0001, "num_tokens": 6265980.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.47424059949150277, "frac_reward_zero_std": 1.0, "grad_norm": 0.06613445963734903, "kl": 0.0035400390625, "learning_rate": 6.68193359483915e-07, "loss": 0.0001, "num_tokens": 6269709.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 364.75, "completions/mean_terminated_length": 364.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.47450822962665595, "frac_reward_zero_std": 1.0, "grad_norm": 0.07801202534050744, "kl": 0.003997802734375, "learning_rate": 6.677876933340951e-07, "loss": 0.0002, "num_tokens": 6273739.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4747758597618092, "frac_reward_zero_std": 1.0, "grad_norm": 0.01641585777190743, "kl": 0.001354217529296875, "learning_rate": 6.673819243954515e-07, "loss": 0.0001, "num_tokens": 6276709.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.4750434898969624, "frac_reward_zero_std": 1.0, "grad_norm": 0.014398533866180403, "kl": 0.001556396484375, "learning_rate": 6.669760530220836e-07, "loss": 0.0001, "num_tokens": 6280099.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4753111200321156, "frac_reward_zero_std": 1.0, "grad_norm": 0.03519491389854836, "kl": 0.002109527587890625, "learning_rate": 6.665700795681793e-07, "loss": 0.0001, "num_tokens": 6283330.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.47557875016726886, "frac_reward_zero_std": 1.0, "grad_norm": 0.034883456529126264, "kl": 0.0019378662109375, "learning_rate": 6.661640043880162e-07, "loss": 0.0001, "num_tokens": 6287122.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 429.25, "completions/mean_terminated_length": 429.25, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.47584638030242205, "frac_reward_zero_std": 1.0, "grad_norm": 0.01609305037623409, "kl": 0.001636505126953125, "learning_rate": 6.657578278359609e-07, "loss": 0.0001, "num_tokens": 6291744.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4761140104375753, "frac_reward_zero_std": 1.0, "grad_norm": 0.026329619056071372, "kl": 0.00194549560546875, "learning_rate": 6.653515502664679e-07, "loss": 0.0001, "num_tokens": 6295644.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.47638164057272847, "frac_reward_zero_std": 1.0, "grad_norm": 0.013015449510133144, "kl": 0.001590728759765625, "learning_rate": 6.649451720340804e-07, "loss": 0.0001, "num_tokens": 6299449.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.4766492707078817, "frac_reward_zero_std": 1.0, "grad_norm": 0.0271875482532482, "kl": 0.00252532958984375, "learning_rate": 6.645386934934289e-07, "loss": 0.0001, "num_tokens": 6302676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.47691690084303495, "frac_reward_zero_std": 1.0, "grad_norm": 0.05070212473632772, "kl": 0.0027027130126953125, "learning_rate": 6.641321149992319e-07, "loss": 0.0001, "num_tokens": 6305453.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.47718453097818814, "frac_reward_zero_std": 1.0, "grad_norm": 0.014997144394934872, "kl": 0.001384735107421875, "learning_rate": 6.637254369062948e-07, "loss": 0.0001, "num_tokens": 6308934.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 363.625, "completions/mean_terminated_length": 363.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4774521611133414, "frac_reward_zero_std": 1.0, "grad_norm": 0.01946720736757376, "kl": 0.00273895263671875, "learning_rate": 6.633186595695101e-07, "loss": 0.0001, "num_tokens": 6312963.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.47771979124849456, "frac_reward_zero_std": 1.0, "grad_norm": 0.014688088429350647, "kl": 0.001300811767578125, "learning_rate": 6.629117833438568e-07, "loss": 0.0001, "num_tokens": 6315911.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 848.0, "completions/mean_length": 531.25, "completions/mean_terminated_length": 460.857177734375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.4779874213836478, "frac_reward_zero_std": 0.5, "grad_norm": 0.40865536222101717, "kl": 0.00168609619140625, "learning_rate": 6.625048085844004e-07, "loss": 0.0735, "num_tokens": 6321425.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.47825505151880104, "frac_reward_zero_std": 1.0, "grad_norm": 0.015906759386552078, "kl": 0.001171112060546875, "learning_rate": 6.620977356462923e-07, "loss": 0.0, "num_tokens": 6324346.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.47852268165395423, "frac_reward_zero_std": 1.0, "grad_norm": 0.012305036187531006, "kl": 0.000926971435546875, "learning_rate": 6.616905648847692e-07, "loss": 0.0, "num_tokens": 6327967.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.47879031178910747, "frac_reward_zero_std": 0.5, "grad_norm": 1.522414310256531, "kl": 0.00140380859375, "learning_rate": 6.612832966551536e-07, "loss": 0.005, "num_tokens": 6331562.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.47905794192426066, "frac_reward_zero_std": 1.0, "grad_norm": 0.010781548772352963, "kl": 0.0006999969482421875, "learning_rate": 6.60875931312853e-07, "loss": 0.0, "num_tokens": 6334600.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 176.625, "completions/mean_terminated_length": 176.625, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.4793255720594139, "frac_reward_zero_std": 1.0, "grad_norm": 0.015872602287748963, "kl": 0.0010986328125, "learning_rate": 6.604684692133596e-07, "loss": 0.0, "num_tokens": 6337017.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 534.0, "completions/max_terminated_length": 534.0, "completions/mean_length": 398.0, "completions/mean_terminated_length": 398.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.4795932021945671, "frac_reward_zero_std": 1.0, "grad_norm": 0.010999585552240183, "kl": 0.001270294189453125, "learning_rate": 6.6006091071225e-07, "loss": 0.0001, "num_tokens": 6341409.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4798608323297203, "frac_reward_zero_std": 1.0, "grad_norm": 0.029305769638892407, "kl": 0.00373077392578125, "learning_rate": 6.596532561651848e-07, "loss": 0.0001, "num_tokens": 6344579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.48012846246487356, "frac_reward_zero_std": 1.0, "grad_norm": 0.03295509109030225, "kl": 0.00214385986328125, "learning_rate": 6.592455059279088e-07, "loss": 0.0001, "num_tokens": 6348046.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 246.125, "completions/mean_terminated_length": 246.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.48039609260002675, "frac_reward_zero_std": 1.0, "grad_norm": 0.018593211991625135, "kl": 0.0029449462890625, "learning_rate": 6.5883766035625e-07, "loss": 0.0001, "num_tokens": 6351051.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 260.25, "completions/mean_terminated_length": 260.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.48066372273518, "frac_reward_zero_std": 0.5, "grad_norm": 0.9802203939797238, "kl": 0.00191497802734375, "learning_rate": 6.584297198061196e-07, "loss": 0.0085, "num_tokens": 6354193.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 318.25, "completions/mean_terminated_length": 318.25, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.4809313528703332, "frac_reward_zero_std": 1.0, "grad_norm": 0.026560176882745472, "kl": 0.00118255615234375, "learning_rate": 6.580216846335117e-07, "loss": 0.0, "num_tokens": 6357843.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 400.375, "completions/mean_terminated_length": 400.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.4811989830054864, "frac_reward_zero_std": 0.5, "grad_norm": 0.7487203412433192, "kl": 0.00244903564453125, "learning_rate": 6.576135551945031e-07, "loss": -0.0359, "num_tokens": 6362162.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.48146661314063965, "frac_reward_zero_std": 1.0, "grad_norm": 0.02055682485463047, "kl": 0.00185394287109375, "learning_rate": 6.572053318452528e-07, "loss": 0.0001, "num_tokens": 6365713.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.48173424327579284, "frac_reward_zero_std": 1.0, "grad_norm": 0.026770983825148274, "kl": 0.0019702911376953125, "learning_rate": 6.567970149420017e-07, "loss": 0.0001, "num_tokens": 6368931.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 333.5, "completions/mean_terminated_length": 333.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.4820018734109461, "frac_reward_zero_std": 1.0, "grad_norm": 0.01710022562677612, "kl": 0.001678466796875, "learning_rate": 6.563886048410723e-07, "loss": 0.0001, "num_tokens": 6372771.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 312.25, "completions/mean_terminated_length": 312.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.48226950354609927, "frac_reward_zero_std": 1.0, "grad_norm": 0.014434677844016522, "kl": 0.001361846923828125, "learning_rate": 6.559801018988686e-07, "loss": 0.0001, "num_tokens": 6376309.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 349.25, "completions/mean_terminated_length": 349.25, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4825371336812525, "frac_reward_zero_std": 1.0, "grad_norm": 0.02513805989114566, "kl": 0.0021209716796875, "learning_rate": 6.555715064718755e-07, "loss": 0.0001, "num_tokens": 6380151.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.48280476381640575, "frac_reward_zero_std": 1.0, "grad_norm": 0.02157842705460585, "kl": 0.001392364501953125, "learning_rate": 6.551628189166588e-07, "loss": 0.0001, "num_tokens": 6383318.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 323.0, "completions/mean_terminated_length": 323.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.48307239395155893, "frac_reward_zero_std": 1.0, "grad_norm": 0.014967036479109972, "kl": 0.0013885498046875, "learning_rate": 6.547540395898643e-07, "loss": 0.0001, "num_tokens": 6387410.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.48334002408671217, "frac_reward_zero_std": 1.0, "grad_norm": 0.013119389141870875, "kl": 0.0010223388671875, "learning_rate": 6.543451688482181e-07, "loss": 0.0, "num_tokens": 6391393.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 220.75, "completions/mean_terminated_length": 220.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.48360765422186536, "frac_reward_zero_std": 1.0, "grad_norm": 0.05452779875895304, "kl": 0.00308990478515625, "learning_rate": 6.539362070485261e-07, "loss": 0.0001, "num_tokens": 6394063.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.4838752843570186, "frac_reward_zero_std": 1.0, "grad_norm": 0.009281846933426501, "kl": 0.00089263916015625, "learning_rate": 6.535271545476739e-07, "loss": 0.0, "num_tokens": 6397711.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 249.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.48414291449217184, "frac_reward_zero_std": 1.0, "grad_norm": 0.011833043419396783, "kl": 0.00075531005859375, "learning_rate": 6.531180117026257e-07, "loss": 0.0, "num_tokens": 6400715.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.484410544627325, "frac_reward_zero_std": 1.0, "grad_norm": 0.032106951293698525, "kl": 0.00226593017578125, "learning_rate": 6.52708778870425e-07, "loss": 0.0001, "num_tokens": 6404450.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.48467817476247826, "frac_reward_zero_std": 0.5, "grad_norm": 0.7391930275287216, "kl": 0.001434326171875, "learning_rate": 6.522994564081939e-07, "loss": 0.0152, "num_tokens": 6407859.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.48494580489763145, "frac_reward_zero_std": 1.0, "grad_norm": 0.020048638263087677, "kl": 0.001209259033203125, "learning_rate": 6.518900446731318e-07, "loss": 0.0, "num_tokens": 6411529.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.4852134350327847, "frac_reward_zero_std": 1.0, "grad_norm": 0.015705555804732263, "kl": 0.0015869140625, "learning_rate": 6.514805440225173e-07, "loss": 0.0001, "num_tokens": 6414822.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 358.875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.48548106516793793, "frac_reward_zero_std": 0.5, "grad_norm": 0.46701660530692274, "kl": 0.00193023681640625, "learning_rate": 6.510709548137058e-07, "loss": -0.0246, "num_tokens": 6418701.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 666.0, "completions/max_terminated_length": 666.0, "completions/mean_length": 466.125, "completions/mean_terminated_length": 466.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4857486953030911, "frac_reward_zero_std": 0.5, "grad_norm": 0.8112884901882014, "kl": 0.0029449462890625, "learning_rate": 6.506612774041301e-07, "loss": 0.0233, "num_tokens": 6423486.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 315.375, "completions/mean_terminated_length": 315.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.48601632543824436, "frac_reward_zero_std": 1.0, "grad_norm": 0.02269890330004527, "kl": 0.0020751953125, "learning_rate": 6.502515121513002e-07, "loss": 0.0001, "num_tokens": 6427033.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.48628395557339754, "frac_reward_zero_std": 1.0, "grad_norm": 0.011925082789270064, "kl": 0.001064300537109375, "learning_rate": 6.498416594128026e-07, "loss": 0.0, "num_tokens": 6430939.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 678.0, "completions/max_terminated_length": 678.0, "completions/mean_length": 411.75, "completions/mean_terminated_length": 411.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.4865515857085508, "frac_reward_zero_std": 0.5, "grad_norm": 0.6660889362987255, "kl": 0.001850128173828125, "learning_rate": 6.494317195462998e-07, "loss": -0.0009, "num_tokens": 6435841.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 252.25, "completions/mean_terminated_length": 252.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.486819215843704, "frac_reward_zero_std": 1.0, "grad_norm": 0.041133615038894074, "kl": 0.003139495849609375, "learning_rate": 6.490216929095311e-07, "loss": 0.0001, "num_tokens": 6438851.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.4870868459788572, "frac_reward_zero_std": 0.5, "grad_norm": 0.77925159396831, "kl": 0.002086639404296875, "learning_rate": 6.486115798603109e-07, "loss": -0.0306, "num_tokens": 6442447.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.48735447611401045, "frac_reward_zero_std": 1.0, "grad_norm": 0.011182582782065046, "kl": 0.001216888427734375, "learning_rate": 6.482013807565291e-07, "loss": 0.0, "num_tokens": 6445930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 390.0, "completions/mean_terminated_length": 390.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.48762210624916363, "frac_reward_zero_std": 0.5, "grad_norm": 0.6648120111827213, "kl": 0.00200653076171875, "learning_rate": 6.47791095956151e-07, "loss": 0.0648, "num_tokens": 6450154.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 462.0, "completions/mean_terminated_length": 462.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.4878897363843169, "frac_reward_zero_std": 1.0, "grad_norm": 0.017120278245073842, "kl": 0.0017547607421875, "learning_rate": 6.473807258172163e-07, "loss": 0.0001, "num_tokens": 6454894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.4881573665194701, "frac_reward_zero_std": 0.5, "grad_norm": 0.8220939764978021, "kl": 0.00275421142578125, "learning_rate": 6.469702706978396e-07, "loss": 0.0195, "num_tokens": 6458621.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 310.875, "completions/mean_terminated_length": 310.875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4884249966546233, "frac_reward_zero_std": 1.0, "grad_norm": 0.013809607135855407, "kl": 0.00159454345703125, "learning_rate": 6.465597309562092e-07, "loss": 0.0001, "num_tokens": 6462484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 223.75, "completions/mean_terminated_length": 223.75, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.48869262678977654, "frac_reward_zero_std": 1.0, "grad_norm": 0.026537021451291776, "kl": 0.00235748291015625, "learning_rate": 6.461491069505876e-07, "loss": 0.0001, "num_tokens": 6465270.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.4889602569249297, "frac_reward_zero_std": 1.0, "grad_norm": 0.01669228966030521, "kl": 0.00183868408203125, "learning_rate": 6.457383990393105e-07, "loss": 0.0001, "num_tokens": 6468826.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.48922788706008297, "frac_reward_zero_std": 1.0, "grad_norm": 0.022861825529514327, "kl": 0.001621246337890625, "learning_rate": 6.453276075807873e-07, "loss": 0.0001, "num_tokens": 6472454.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 635.0, "completions/max_terminated_length": 635.0, "completions/mean_length": 394.875, "completions/mean_terminated_length": 394.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.4894955171952362, "frac_reward_zero_std": 1.0, "grad_norm": 0.02407569649920703, "kl": 0.002105712890625, "learning_rate": 6.449167329334996e-07, "loss": 0.0001, "num_tokens": 6476641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.4897631473303894, "frac_reward_zero_std": 1.0, "grad_norm": 0.021238480673255834, "kl": 0.001522064208984375, "learning_rate": 6.445057754560025e-07, "loss": 0.0001, "num_tokens": 6480485.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.49003077746554263, "frac_reward_zero_std": 1.0, "grad_norm": 0.017720009868782003, "kl": 0.00167083740234375, "learning_rate": 6.440947355069225e-07, "loss": 0.0001, "num_tokens": 6483572.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.4902984076006958, "frac_reward_zero_std": 1.0, "grad_norm": 0.02138314885849877, "kl": 0.00176239013671875, "learning_rate": 6.436836134449587e-07, "loss": 0.0001, "num_tokens": 6487046.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 435.625, "completions/mean_terminated_length": 435.625, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.49056603773584906, "frac_reward_zero_std": 1.0, "grad_norm": 0.02780111100319684, "kl": 0.00180816650390625, "learning_rate": 6.432724096288817e-07, "loss": 0.0001, "num_tokens": 6491667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.4908336678710023, "frac_reward_zero_std": 1.0, "grad_norm": 0.05353020948055324, "kl": 0.00255584716796875, "learning_rate": 6.428611244175334e-07, "loss": 0.0001, "num_tokens": 6495025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.4911012980061555, "frac_reward_zero_std": 1.0, "grad_norm": 0.023637804284131817, "kl": 0.0017547607421875, "learning_rate": 6.424497581698262e-07, "loss": 0.0001, "num_tokens": 6498551.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.4913689281413087, "frac_reward_zero_std": 1.0, "grad_norm": 0.01293041630064703, "kl": 0.001102447509765625, "learning_rate": 6.420383112447445e-07, "loss": 0.0, "num_tokens": 6501954.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.4916365582764619, "frac_reward_zero_std": 0.5, "grad_norm": 1.001677558969907, "kl": 0.00225067138671875, "learning_rate": 6.416267840013416e-07, "loss": -0.0214, "num_tokens": 6505081.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 345.125, "completions/mean_terminated_length": 345.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.49190418841161515, "frac_reward_zero_std": 1.0, "grad_norm": 0.08423525457293132, "kl": 0.001445770263671875, "learning_rate": 6.41215176798742e-07, "loss": 0.0001, "num_tokens": 6509078.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 445.25, "completions/mean_terminated_length": 445.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.4921718185467684, "frac_reward_zero_std": 0.5, "grad_norm": 0.9706642913770323, "kl": 0.00299072265625, "learning_rate": 6.408034899961397e-07, "loss": 0.0191, "num_tokens": 6513860.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 402.875, "completions/mean_terminated_length": 402.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.4924394486819216, "frac_reward_zero_std": 1.0, "grad_norm": 0.01972326884271181, "kl": 0.001811981201171875, "learning_rate": 6.403917239527978e-07, "loss": 0.0001, "num_tokens": 6518295.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.4927070788170748, "frac_reward_zero_std": 1.0, "grad_norm": 0.027098184266086112, "kl": 0.0013828277587890625, "learning_rate": 6.399798790280487e-07, "loss": 0.0001, "num_tokens": 6522724.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.492974708952228, "frac_reward_zero_std": 1.0, "grad_norm": 0.0158640929517483, "kl": 0.000972747802734375, "learning_rate": 6.39567955581294e-07, "loss": 0.0, "num_tokens": 6525704.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.49324233908738124, "frac_reward_zero_std": 1.0, "grad_norm": 0.01828782385469192, "kl": 0.0024871826171875, "learning_rate": 6.391559539720037e-07, "loss": 0.0001, "num_tokens": 6529291.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 385.375, "completions/mean_terminated_length": 385.375, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.4935099692225345, "frac_reward_zero_std": 0.5, "grad_norm": 0.9711823036343701, "kl": 0.0014286041259765625, "learning_rate": 6.387438745597156e-07, "loss": -0.0077, "num_tokens": 6533538.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 342.625, "completions/mean_terminated_length": 342.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.49377759935768767, "frac_reward_zero_std": 1.0, "grad_norm": 0.016037232365824664, "kl": 0.001216888427734375, "learning_rate": 6.383317177040356e-07, "loss": 0.0, "num_tokens": 6537235.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 381.0, "completions/mean_terminated_length": 381.0, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.4940452294928409, "frac_reward_zero_std": 0.5, "grad_norm": 0.5863178930810073, "kl": 0.00152587890625, "learning_rate": 6.379194837646376e-07, "loss": 0.0331, "num_tokens": 6541483.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.4943128596279941, "frac_reward_zero_std": 1.0, "grad_norm": 0.025379164936336084, "kl": 0.0012645721435546875, "learning_rate": 6.375071731012623e-07, "loss": 0.0001, "num_tokens": 6544868.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 320.625, "completions/mean_terminated_length": 320.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.49458048976314734, "frac_reward_zero_std": 1.0, "grad_norm": 0.02712605121299074, "kl": 0.001926422119140625, "learning_rate": 6.370947860737171e-07, "loss": 0.0001, "num_tokens": 6548461.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.4948481198983006, "frac_reward_zero_std": 1.0, "grad_norm": 0.015267360430834865, "kl": 0.00154876708984375, "learning_rate": 6.36682323041877e-07, "loss": 0.0001, "num_tokens": 6552188.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 362.5, "completions/mean_terminated_length": 362.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.49511575003345376, "frac_reward_zero_std": 1.0, "grad_norm": 0.04332905665323875, "kl": 0.00293731689453125, "learning_rate": 6.362697843656823e-07, "loss": 0.0001, "num_tokens": 6556192.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.495383380168607, "frac_reward_zero_std": 1.0, "grad_norm": 0.03864934861854214, "kl": 0.003238677978515625, "learning_rate": 6.3585717040514e-07, "loss": 0.0001, "num_tokens": 6560005.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 462.375, "completions/mean_terminated_length": 462.375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.4956510103037602, "frac_reward_zero_std": 0.5, "grad_norm": 0.895944977793859, "kl": 0.00188446044921875, "learning_rate": 6.354444815203224e-07, "loss": 0.0001, "num_tokens": 6564852.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.4959186404389134, "frac_reward_zero_std": 1.0, "grad_norm": 0.020883380591669866, "kl": 0.0016937255859375, "learning_rate": 6.350317180713674e-07, "loss": 0.0001, "num_tokens": 6567352.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.4961862705740666, "frac_reward_zero_std": 1.0, "grad_norm": 0.014618923892336837, "kl": 0.0011119842529296875, "learning_rate": 6.346188804184781e-07, "loss": 0.0, "num_tokens": 6570316.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.49645390070921985, "frac_reward_zero_std": 1.0, "grad_norm": 0.014077361509076429, "kl": 0.0014801025390625, "learning_rate": 6.342059689219218e-07, "loss": 0.0001, "num_tokens": 6574261.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 339.5, "completions/mean_terminated_length": 339.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.4967215308443731, "frac_reward_zero_std": 0.5, "grad_norm": 0.9066954610183214, "kl": 0.00232696533203125, "learning_rate": 6.337929839420308e-07, "loss": 0.0232, "num_tokens": 6578205.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 272.25, "completions/mean_terminated_length": 272.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.4969891609795263, "frac_reward_zero_std": 1.0, "grad_norm": 0.01990812559580825, "kl": 0.00156402587890625, "learning_rate": 6.333799258392015e-07, "loss": 0.0001, "num_tokens": 6581451.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 372.0, "completions/mean_terminated_length": 372.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.4972567911146795, "frac_reward_zero_std": 1.0, "grad_norm": 0.013590841703176575, "kl": 0.001148223876953125, "learning_rate": 6.329667949738937e-07, "loss": 0.0, "num_tokens": 6585515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 243.875, "completions/mean_terminated_length": 243.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.4975244212498327, "frac_reward_zero_std": 1.0, "grad_norm": 0.015342626959077186, "kl": 0.0009899139404296875, "learning_rate": 6.325535917066307e-07, "loss": 0.0, "num_tokens": 6588418.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.49779205138498595, "frac_reward_zero_std": 1.0, "grad_norm": 0.026599941854922027, "kl": 0.001491546630859375, "learning_rate": 6.321403163979997e-07, "loss": 0.0001, "num_tokens": 6592048.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 660.0, "completions/max_terminated_length": 660.0, "completions/mean_length": 447.25, "completions/mean_terminated_length": 447.25, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.4980596815201392, "frac_reward_zero_std": 0.5, "grad_norm": 0.6829373729319584, "kl": 0.001384735107421875, "learning_rate": 6.3172696940865e-07, "loss": 0.0565, "num_tokens": 6596790.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.49832731165529237, "frac_reward_zero_std": 1.0, "grad_norm": 0.014820919155698985, "kl": 0.001468658447265625, "learning_rate": 6.313135510992939e-07, "loss": 0.0001, "num_tokens": 6600139.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.4985949417904456, "frac_reward_zero_std": 1.0, "grad_norm": 0.02082673209193816, "kl": 0.001155853271484375, "learning_rate": 6.309000618307057e-07, "loss": 0.0, "num_tokens": 6603280.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 339.375, "completions/mean_terminated_length": 339.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4988625719255988, "frac_reward_zero_std": 1.0, "grad_norm": 0.018296448736687478, "kl": 0.0016632080078125, "learning_rate": 6.304865019637218e-07, "loss": 0.0001, "num_tokens": 6607315.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.49913020206075204, "frac_reward_zero_std": 1.0, "grad_norm": 0.018456522465904856, "kl": 0.00246429443359375, "learning_rate": 6.300728718592398e-07, "loss": 0.0001, "num_tokens": 6610847.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.4993978321959053, "frac_reward_zero_std": 1.0, "grad_norm": 0.011714345925164371, "kl": 0.00115966796875, "learning_rate": 6.296591718782192e-07, "loss": 0.0, "num_tokens": 6614667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 289.25, "completions/mean_terminated_length": 289.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.49966546233105846, "frac_reward_zero_std": 1.0, "grad_norm": 0.022703952991697737, "kl": 0.0016632080078125, "learning_rate": 6.292454023816802e-07, "loss": 0.0001, "num_tokens": 6617969.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.4999330924662117, "frac_reward_zero_std": 1.0, "grad_norm": 0.012782613852903284, "kl": 0.001239776611328125, "learning_rate": 6.288315637307035e-07, "loss": 0.0, "num_tokens": 6621589.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 394.0, "completions/mean_terminated_length": 394.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5002007226013649, "frac_reward_zero_std": 1.0, "grad_norm": 0.018079600589964853, "kl": 0.00174713134765625, "learning_rate": 6.284176562864303e-07, "loss": 0.0001, "num_tokens": 6625901.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5004683527365181, "frac_reward_zero_std": 1.0, "grad_norm": 0.012132948416438279, "kl": 0.00113677978515625, "learning_rate": 6.280036804100621e-07, "loss": 0.0, "num_tokens": 6629448.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 396.625, "completions/mean_terminated_length": 307.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5007359828716713, "frac_reward_zero_std": 0.5, "grad_norm": 0.31563871438409835, "kl": 0.0009403228759765625, "learning_rate": 6.275896364628592e-07, "loss": 0.1457, "num_tokens": 6633597.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 264.125, "completions/mean_terminated_length": 264.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5010036130068246, "frac_reward_zero_std": 1.0, "grad_norm": 0.07371171869232625, "kl": 0.004180908203125, "learning_rate": 6.271755248061425e-07, "loss": 0.0002, "num_tokens": 6636642.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5012712431419778, "frac_reward_zero_std": 1.0, "grad_norm": 0.014736094948484, "kl": 0.0020904541015625, "learning_rate": 6.267613458012912e-07, "loss": 0.0001, "num_tokens": 6640571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.501538873277131, "frac_reward_zero_std": 1.0, "grad_norm": 0.01709933174612408, "kl": 0.001377105712890625, "learning_rate": 6.263470998097438e-07, "loss": 0.0001, "num_tokens": 6644368.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 233.0, "completions/mean_terminated_length": 233.0, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5018065034122843, "frac_reward_zero_std": 1.0, "grad_norm": 0.022731207053053766, "kl": 0.00189971923828125, "learning_rate": 6.259327871929967e-07, "loss": 0.0001, "num_tokens": 6647416.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5020741335474375, "frac_reward_zero_std": 1.0, "grad_norm": 0.020978616785757805, "kl": 0.0016021728515625, "learning_rate": 6.255184083126048e-07, "loss": 0.0001, "num_tokens": 6650325.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5023417636825906, "frac_reward_zero_std": 1.0, "grad_norm": 0.014408795869260303, "kl": 0.00142669677734375, "learning_rate": 6.251039635301806e-07, "loss": 0.0001, "num_tokens": 6653481.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 244.125, "completions/mean_terminated_length": 244.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5026093938177438, "frac_reward_zero_std": 1.0, "grad_norm": 0.013330436153373813, "kl": 0.001346588134765625, "learning_rate": 6.246894532073944e-07, "loss": 0.0001, "num_tokens": 6656398.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 363.25, "completions/mean_terminated_length": 363.25, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.5028770239528971, "frac_reward_zero_std": 0.5, "grad_norm": 0.6238255775847442, "kl": 0.00168609619140625, "learning_rate": 6.242748777059734e-07, "loss": 0.0169, "num_tokens": 6660324.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 328.375, "completions/mean_terminated_length": 328.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5031446540880503, "frac_reward_zero_std": 1.0, "grad_norm": 0.014759163783960822, "kl": 0.00144195556640625, "learning_rate": 6.238602373877018e-07, "loss": 0.0001, "num_tokens": 6664051.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 272.125, "completions/mean_terminated_length": 272.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5034122842232035, "frac_reward_zero_std": 1.0, "grad_norm": 0.012300726982455538, "kl": 0.000980377197265625, "learning_rate": 6.234455326144208e-07, "loss": 0.0, "num_tokens": 6667364.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 355.75, "completions/mean_terminated_length": 355.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.5036799143583568, "frac_reward_zero_std": 1.0, "grad_norm": 0.02012358573267316, "kl": 0.0021209716796875, "learning_rate": 6.230307637480268e-07, "loss": 0.0001, "num_tokens": 6671398.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.50394754449351, "frac_reward_zero_std": 1.0, "grad_norm": 0.03358581512219366, "kl": 0.00151824951171875, "learning_rate": 6.226159311504731e-07, "loss": 0.0001, "num_tokens": 6674576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 284.875, "completions/mean_terminated_length": 284.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5042151746286632, "frac_reward_zero_std": 1.0, "grad_norm": 0.024918041130222387, "kl": 0.002410888671875, "learning_rate": 6.222010351837683e-07, "loss": 0.0001, "num_tokens": 6678071.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5044828047638164, "frac_reward_zero_std": 1.0, "grad_norm": 0.02214749100765722, "kl": 0.00183868408203125, "learning_rate": 6.217860762099761e-07, "loss": 0.0001, "num_tokens": 6681299.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5047504348989696, "frac_reward_zero_std": 1.0, "grad_norm": 0.01702248716996958, "kl": 0.002002716064453125, "learning_rate": 6.213710545912157e-07, "loss": 0.0001, "num_tokens": 6684316.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 297.125, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5050180650341228, "frac_reward_zero_std": 1.0, "grad_norm": 0.01245378927023171, "kl": 0.0008544921875, "learning_rate": 6.209559706896602e-07, "loss": 0.0, "num_tokens": 6687601.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.505285695169276, "frac_reward_zero_std": 1.0, "grad_norm": 0.02856661590011905, "kl": 0.00823974609375, "learning_rate": 6.205408248675377e-07, "loss": 0.0003, "num_tokens": 6690601.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5055533253044293, "frac_reward_zero_std": 1.0, "grad_norm": 0.023266153149694397, "kl": 0.0022430419921875, "learning_rate": 6.2012561748713e-07, "loss": 0.0001, "num_tokens": 6694077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5058209554395825, "frac_reward_zero_std": 1.0, "grad_norm": 0.01524077930574621, "kl": 0.001323699951171875, "learning_rate": 6.197103489107725e-07, "loss": 0.0001, "num_tokens": 6697526.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 356.5, "completions/mean_terminated_length": 356.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5060885855747357, "frac_reward_zero_std": 0.5, "grad_norm": 0.8095305794593729, "kl": 0.00339508056640625, "learning_rate": 6.192950195008549e-07, "loss": 0.0417, "num_tokens": 6701458.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.506356215709889, "frac_reward_zero_std": 1.0, "grad_norm": 0.011244115293638314, "kl": 0.0011749267578125, "learning_rate": 6.188796296198191e-07, "loss": 0.0, "num_tokens": 6705002.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5066238458450422, "frac_reward_zero_std": 0.5, "grad_norm": 1.0996808580400186, "kl": 0.001461029052734375, "learning_rate": 6.184641796301595e-07, "loss": 0.0128, "num_tokens": 6708064.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 242.25, "completions/mean_terminated_length": 242.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5068914759801953, "frac_reward_zero_std": 1.0, "grad_norm": 0.029738780731447843, "kl": 0.002471923828125, "learning_rate": 6.180486698944239e-07, "loss": 0.0001, "num_tokens": 6710870.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5071591061153485, "frac_reward_zero_std": 1.0, "grad_norm": 0.048369740284237836, "kl": 0.00283050537109375, "learning_rate": 6.176331007752117e-07, "loss": 0.0001, "num_tokens": 6713922.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.5074267362505018, "frac_reward_zero_std": 1.0, "grad_norm": 0.012831605578054572, "kl": 0.0011081695556640625, "learning_rate": 6.172174726351742e-07, "loss": 0.0, "num_tokens": 6717574.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 371.375, "completions/mean_terminated_length": 371.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.507694366385655, "frac_reward_zero_std": 1.0, "grad_norm": 0.02211618851058927, "kl": 0.001934051513671875, "learning_rate": 6.168017858370141e-07, "loss": 0.0001, "num_tokens": 6721581.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5079619965208082, "frac_reward_zero_std": 1.0, "grad_norm": 0.01280777257809266, "kl": 0.0010223388671875, "learning_rate": 6.163860407434856e-07, "loss": 0.0, "num_tokens": 6725358.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5082296266559615, "frac_reward_zero_std": 1.0, "grad_norm": 0.012482710778709386, "kl": 0.001590728759765625, "learning_rate": 6.159702377173934e-07, "loss": 0.0001, "num_tokens": 6728907.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 416.625, "completions/mean_terminated_length": 416.625, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.5084972567911147, "frac_reward_zero_std": 0.5, "grad_norm": 0.6533851509475365, "kl": 0.001461029052734375, "learning_rate": 6.155543771215929e-07, "loss": -0.0556, "num_tokens": 6733352.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5087648869262679, "frac_reward_zero_std": 1.0, "grad_norm": 0.014087575744237595, "kl": 0.00156402587890625, "learning_rate": 6.151384593189901e-07, "loss": 0.0001, "num_tokens": 6736560.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 305.5, "completions/mean_terminated_length": 305.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5090325170614212, "frac_reward_zero_std": 1.0, "grad_norm": 0.02470278062896007, "kl": 0.00232696533203125, "learning_rate": 6.147224846725402e-07, "loss": 0.0001, "num_tokens": 6740052.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5093001471965743, "frac_reward_zero_std": 1.0, "grad_norm": 0.02531243691466241, "kl": 0.0013647079467773438, "learning_rate": 6.143064535452487e-07, "loss": 0.0001, "num_tokens": 6743099.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5095677773317275, "frac_reward_zero_std": 1.0, "grad_norm": 0.015178379499501899, "kl": 0.0009670257568359375, "learning_rate": 6.138903663001699e-07, "loss": 0.0, "num_tokens": 6746996.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5098354074668807, "frac_reward_zero_std": 1.0, "grad_norm": 0.019807521484901405, "kl": 0.00136566162109375, "learning_rate": 6.134742233004072e-07, "loss": 0.0001, "num_tokens": 6750336.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.510103037602034, "frac_reward_zero_std": 1.0, "grad_norm": 0.035356077268609155, "kl": 0.002338409423828125, "learning_rate": 6.130580249091127e-07, "loss": 0.0001, "num_tokens": 6753734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5103706677371872, "frac_reward_zero_std": 1.0, "grad_norm": 0.02951037842965016, "kl": 0.001392364501953125, "learning_rate": 6.126417714894872e-07, "loss": 0.0001, "num_tokens": 6757227.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 420.25, "completions/mean_terminated_length": 334.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.5106382978723404, "frac_reward_zero_std": 0.5, "grad_norm": 0.6210857986108093, "kl": 0.00337982177734375, "learning_rate": 6.122254634047787e-07, "loss": 0.0972, "num_tokens": 6761937.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5109059280074937, "frac_reward_zero_std": 1.0, "grad_norm": 0.019477816431379726, "kl": 0.001506805419921875, "learning_rate": 6.118091010182836e-07, "loss": 0.0001, "num_tokens": 6765199.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 317.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5111735581426469, "frac_reward_zero_std": 1.0, "grad_norm": 0.026463183231575312, "kl": 0.00191497802734375, "learning_rate": 6.113926846933456e-07, "loss": 0.0001, "num_tokens": 6768868.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5114411882778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03152583277922437, "kl": 0.0029144287109375, "learning_rate": 6.109762147933552e-07, "loss": 0.0001, "num_tokens": 6772215.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 361.125, "completions/mean_terminated_length": 361.125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5117088184129533, "frac_reward_zero_std": 0.5, "grad_norm": 0.851671604652491, "kl": 0.002048492431640625, "learning_rate": 6.105596916817496e-07, "loss": 0.0417, "num_tokens": 6776148.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5119764485481065, "frac_reward_zero_std": 1.0, "grad_norm": 0.008103814693662592, "kl": 0.000408172607421875, "learning_rate": 6.101431157220128e-07, "loss": 0.0, "num_tokens": 6779544.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 343.375, "completions/mean_terminated_length": 343.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5122440786832597, "frac_reward_zero_std": 1.0, "grad_norm": 0.012342484276260958, "kl": 0.001129150390625, "learning_rate": 6.097264872776749e-07, "loss": 0.0, "num_tokens": 6783323.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 256.125, "completions/mean_terminated_length": 256.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.5125117088184129, "frac_reward_zero_std": 1.0, "grad_norm": 0.013385117763094059, "kl": 0.000774383544921875, "learning_rate": 6.093098067123112e-07, "loss": 0.0, "num_tokens": 6786528.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5127793389535662, "frac_reward_zero_std": 1.0, "grad_norm": 0.03226251449682221, "kl": 0.00197601318359375, "learning_rate": 6.088930743895435e-07, "loss": 0.0001, "num_tokens": 6789905.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 264.625, "completions/mean_terminated_length": 264.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5130469690887194, "frac_reward_zero_std": 1.0, "grad_norm": 0.01608021934933858, "kl": 0.001384735107421875, "learning_rate": 6.084762906730378e-07, "loss": 0.0001, "num_tokens": 6793110.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.5133145992238726, "frac_reward_zero_std": 1.0, "grad_norm": 0.020023360610819108, "kl": 0.0011386871337890625, "learning_rate": 6.080594559265054e-07, "loss": 0.0, "num_tokens": 6796547.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 454.0, "completions/mean_terminated_length": 454.0, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.5135822293590259, "frac_reward_zero_std": 1.0, "grad_norm": 0.022470410478209604, "kl": 0.001514434814453125, "learning_rate": 6.076425705137019e-07, "loss": 0.0001, "num_tokens": 6801371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.513849859494179, "frac_reward_zero_std": 1.0, "grad_norm": 0.019721225445729955, "kl": 0.00157928466796875, "learning_rate": 6.072256347984276e-07, "loss": 0.0001, "num_tokens": 6805568.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5141174896293322, "frac_reward_zero_std": 1.0, "grad_norm": 0.018173371901076204, "kl": 0.000934600830078125, "learning_rate": 6.068086491445263e-07, "loss": 0.0, "num_tokens": 6808949.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5143851197644855, "frac_reward_zero_std": 0.5, "grad_norm": 1.1977156923556274, "kl": 0.001617431640625, "learning_rate": 6.063916139158856e-07, "loss": 0.0212, "num_tokens": 6812481.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5146527498996387, "frac_reward_zero_std": 1.0, "grad_norm": 0.06423138181420708, "kl": 0.002414703369140625, "learning_rate": 6.059745294764357e-07, "loss": 0.0001, "num_tokens": 6815673.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 416.0, "completions/mean_terminated_length": 416.0, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5149203800347919, "frac_reward_zero_std": 0.5, "grad_norm": 1.0034644897660918, "kl": 0.0025177001953125, "learning_rate": 6.055573961901508e-07, "loss": 0.0001, "num_tokens": 6820233.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 293.25, "completions/mean_terminated_length": 293.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5151880101699451, "frac_reward_zero_std": 1.0, "grad_norm": 0.022089831945260756, "kl": 0.002040863037109375, "learning_rate": 6.05140214421047e-07, "loss": 0.0001, "num_tokens": 6823663.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5154556403050984, "frac_reward_zero_std": 1.0, "grad_norm": 0.02234534515679119, "kl": 0.002643585205078125, "learning_rate": 6.047229845331829e-07, "loss": 0.0001, "num_tokens": 6827077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5157232704402516, "frac_reward_zero_std": 1.0, "grad_norm": 0.01659097170167622, "kl": 0.001293182373046875, "learning_rate": 6.043057068906591e-07, "loss": 0.0001, "num_tokens": 6830256.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 331.25, "completions/mean_terminated_length": 331.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5159909005754048, "frac_reward_zero_std": 1.0, "grad_norm": 0.06134017460924465, "kl": 0.0028533935546875, "learning_rate": 6.03888381857618e-07, "loss": 0.0001, "num_tokens": 6834014.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 461.375, "completions/mean_terminated_length": 461.375, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.516258530710558, "frac_reward_zero_std": 1.0, "grad_norm": 0.014692051046545838, "kl": 0.0016632080078125, "learning_rate": 6.034710097982431e-07, "loss": 0.0001, "num_tokens": 6838937.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 238.0, "completions/max_terminated_length": 238.0, "completions/mean_length": 216.125, "completions/mean_terminated_length": 216.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5165261608457112, "frac_reward_zero_std": 1.0, "grad_norm": 0.016448799972420887, "kl": 0.00118255615234375, "learning_rate": 6.030535910767588e-07, "loss": 0.0, "num_tokens": 6841554.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5167937909808644, "frac_reward_zero_std": 1.0, "grad_norm": 0.012086961496046092, "kl": 0.0012664794921875, "learning_rate": 6.02636126057431e-07, "loss": 0.0001, "num_tokens": 6845872.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.5170614211160177, "frac_reward_zero_std": 1.0, "grad_norm": 0.019232434905684246, "kl": 0.0036163330078125, "learning_rate": 6.022186151045651e-07, "loss": 0.0001, "num_tokens": 6849451.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5173290512511709, "frac_reward_zero_std": 1.0, "grad_norm": 0.02324720573496547, "kl": 0.00244140625, "learning_rate": 6.018010585825073e-07, "loss": 0.0001, "num_tokens": 6853783.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5175966813863241, "frac_reward_zero_std": 1.0, "grad_norm": 0.03508961616384975, "kl": 0.0020599365234375, "learning_rate": 6.01383456855643e-07, "loss": 0.0001, "num_tokens": 6857348.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 403.5, "completions/mean_terminated_length": 403.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.5178643115214773, "frac_reward_zero_std": 1.0, "grad_norm": 0.027709495355542655, "kl": 0.00211334228515625, "learning_rate": 6.009658102883974e-07, "loss": 0.0001, "num_tokens": 6861892.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 409.375, "completions/mean_terminated_length": 409.375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.5181319416566306, "frac_reward_zero_std": 1.0, "grad_norm": 0.008966069228825861, "kl": 0.000865936279296875, "learning_rate": 6.005481192452345e-07, "loss": 0.0, "num_tokens": 6866251.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 445.125, "completions/mean_terminated_length": 445.125, "completions/min_length": 349.0, "completions/min_terminated_length": 349.0, "epoch": 0.5183995717917838, "frac_reward_zero_std": 1.0, "grad_norm": 0.014983907257243984, "kl": 0.0015621185302734375, "learning_rate": 6.001303840906575e-07, "loss": 0.0001, "num_tokens": 6871356.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.5186672019269369, "frac_reward_zero_std": 1.0, "grad_norm": 0.021474023924701775, "kl": 0.001922607421875, "learning_rate": 5.99712605189208e-07, "loss": 0.0001, "num_tokens": 6874473.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5189348320620902, "frac_reward_zero_std": 1.0, "grad_norm": 0.015098114970479836, "kl": 0.0012340545654296875, "learning_rate": 5.992947829054657e-07, "loss": 0.0, "num_tokens": 6877464.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5192024621972434, "frac_reward_zero_std": 1.0, "grad_norm": 0.022911487743820368, "kl": 0.001628875732421875, "learning_rate": 5.988769176040478e-07, "loss": 0.0001, "num_tokens": 6880658.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 301.25, "completions/mean_terminated_length": 301.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5194700923323966, "frac_reward_zero_std": 1.0, "grad_norm": 0.014787844676394178, "kl": 0.0018310546875, "learning_rate": 5.984590096496098e-07, "loss": 0.0001, "num_tokens": 6884144.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 424.375, "completions/mean_terminated_length": 424.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.5197377224675499, "frac_reward_zero_std": 1.0, "grad_norm": 0.011954524814248153, "kl": 0.001636505126953125, "learning_rate": 5.980410594068439e-07, "loss": 0.0001, "num_tokens": 6888695.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 351.875, "completions/mean_terminated_length": 351.875, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.5200053526027031, "frac_reward_zero_std": 1.0, "grad_norm": 0.014834896447279965, "kl": 0.0010528564453125, "learning_rate": 5.976230672404793e-07, "loss": 0.0, "num_tokens": 6892454.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5202729827378563, "frac_reward_zero_std": 1.0, "grad_norm": 0.03046134048790715, "kl": 0.001735687255859375, "learning_rate": 5.972050335152818e-07, "loss": 0.0001, "num_tokens": 6895695.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 392.125, "completions/mean_terminated_length": 392.125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5205406128730095, "frac_reward_zero_std": 0.5, "grad_norm": 0.5605505398582672, "kl": 0.001445770263671875, "learning_rate": 5.967869585960534e-07, "loss": 0.0103, "num_tokens": 6900028.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 227.125, "completions/mean_terminated_length": 227.125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5208082430081628, "frac_reward_zero_std": 1.0, "grad_norm": 0.04267432164097527, "kl": 0.001926422119140625, "learning_rate": 5.963688428476324e-07, "loss": 0.0001, "num_tokens": 6902905.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5210758731433159, "frac_reward_zero_std": 1.0, "grad_norm": 0.018158207000034458, "kl": 0.00180816650390625, "learning_rate": 5.959506866348923e-07, "loss": 0.0001, "num_tokens": 6905965.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5213435032784691, "frac_reward_zero_std": 1.0, "grad_norm": 0.016567997550904483, "kl": 0.0007572174072265625, "learning_rate": 5.955324903227418e-07, "loss": 0.0, "num_tokens": 6909088.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.5216111334136224, "frac_reward_zero_std": 0.5, "grad_norm": 0.7493232540506716, "kl": 0.0009918212890625, "learning_rate": 5.951142542761252e-07, "loss": 0.0302, "num_tokens": 6912666.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 426.75, "completions/mean_terminated_length": 341.4285888671875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5218787635487756, "frac_reward_zero_std": 0.5, "grad_norm": 0.37628183500012546, "kl": 0.001552581787109375, "learning_rate": 5.946959788600209e-07, "loss": 0.0535, "num_tokens": 6917412.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5221463936839288, "frac_reward_zero_std": 1.0, "grad_norm": 0.0193681151250822, "kl": 0.00213623046875, "learning_rate": 5.942776644394422e-07, "loss": 0.0001, "num_tokens": 6920657.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.522414023819082, "frac_reward_zero_std": 1.0, "grad_norm": 0.018331643464237088, "kl": 0.001430511474609375, "learning_rate": 5.938593113794359e-07, "loss": 0.0001, "num_tokens": 6924194.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 417.875, "completions/mean_terminated_length": 417.875, "completions/min_length": 358.0, "completions/min_terminated_length": 358.0, "epoch": 0.5226816539542353, "frac_reward_zero_std": 0.5, "grad_norm": 0.5585003845297355, "kl": 0.00177001953125, "learning_rate": 5.934409200450827e-07, "loss": 0.0001, "num_tokens": 6928877.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 449.0, "completions/mean_terminated_length": 449.0, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "epoch": 0.5229492840893885, "frac_reward_zero_std": 1.0, "grad_norm": 0.010768395962795692, "kl": 0.0012664794921875, "learning_rate": 5.930224908014969e-07, "loss": 0.0001, "num_tokens": 6933725.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 277.0, "completions/mean_terminated_length": 277.0, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.5232169142245416, "frac_reward_zero_std": 1.0, "grad_norm": 0.016416819535179594, "kl": 0.0012569427490234375, "learning_rate": 5.926040240138257e-07, "loss": 0.0001, "num_tokens": 6936865.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 236.625, "completions/mean_terminated_length": 236.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5234845443596949, "frac_reward_zero_std": 1.0, "grad_norm": 0.02214530291000852, "kl": 0.00186920166015625, "learning_rate": 5.921855200472489e-07, "loss": 0.0001, "num_tokens": 6939710.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 389.75, "completions/mean_terminated_length": 389.75, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5237521744948481, "frac_reward_zero_std": 0.5, "grad_norm": 0.7311506786206561, "kl": 0.002399444580078125, "learning_rate": 5.917669792669791e-07, "loss": -0.0077, "num_tokens": 6944044.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 369.625, "completions/mean_terminated_length": 369.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.5240198046300013, "frac_reward_zero_std": 1.0, "grad_norm": 0.02321980876317193, "kl": 0.0017547607421875, "learning_rate": 5.913484020382611e-07, "loss": 0.0001, "num_tokens": 6948021.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 284.5, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5242874347651546, "frac_reward_zero_std": 1.0, "grad_norm": 0.013859020399629978, "kl": 0.001163482666015625, "learning_rate": 5.909297887263707e-07, "loss": 0.0, "num_tokens": 6951353.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5245550649003078, "frac_reward_zero_std": 1.0, "grad_norm": 0.012527119803051406, "kl": 0.001270294189453125, "learning_rate": 5.905111396966162e-07, "loss": 0.0001, "num_tokens": 6954897.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.524822695035461, "frac_reward_zero_std": 1.0, "grad_norm": 0.016315121289919028, "kl": 0.001312255859375, "learning_rate": 5.900924553143364e-07, "loss": 0.0001, "num_tokens": 6958267.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.5250903251706142, "frac_reward_zero_std": 1.0, "grad_norm": 0.01536939110187309, "kl": 0.00189208984375, "learning_rate": 5.896737359449014e-07, "loss": 0.0001, "num_tokens": 6962061.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5253579553057675, "frac_reward_zero_std": 0.5, "grad_norm": 0.8292362117308179, "kl": 0.00164031982421875, "learning_rate": 5.892549819537115e-07, "loss": 0.018, "num_tokens": 6964917.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5256255854409206, "frac_reward_zero_std": 1.0, "grad_norm": 0.01412531894984555, "kl": 0.001682281494140625, "learning_rate": 5.888361937061973e-07, "loss": 0.0001, "num_tokens": 6968067.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5258932155760738, "frac_reward_zero_std": 1.0, "grad_norm": 0.04615812581886723, "kl": 0.00225830078125, "learning_rate": 5.884173715678192e-07, "loss": 0.0001, "num_tokens": 6971680.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5261608457112271, "frac_reward_zero_std": 1.0, "grad_norm": 0.02050927737350705, "kl": 0.0023651123046875, "learning_rate": 5.879985159040675e-07, "loss": 0.0001, "num_tokens": 6975582.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5264284758463803, "frac_reward_zero_std": 1.0, "grad_norm": 0.02715608320155108, "kl": 0.002269744873046875, "learning_rate": 5.875796270804616e-07, "loss": 0.0001, "num_tokens": 6979156.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 393.375, "completions/mean_terminated_length": 393.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.5266961059815335, "frac_reward_zero_std": 1.0, "grad_norm": 0.012678111023983585, "kl": 0.0013580322265625, "learning_rate": 5.871607054625496e-07, "loss": 0.0001, "num_tokens": 6983431.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.5269637361166868, "frac_reward_zero_std": 1.0, "grad_norm": 0.06427524708203011, "kl": 0.00279998779296875, "learning_rate": 5.867417514159084e-07, "loss": 0.0001, "num_tokens": 6986415.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.52723136625184, "frac_reward_zero_std": 1.0, "grad_norm": 0.020167876273554923, "kl": 0.001323699951171875, "learning_rate": 5.863227653061433e-07, "loss": 0.0001, "num_tokens": 6989346.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5274989963869932, "frac_reward_zero_std": 0.5, "grad_norm": 1.247958769979468, "kl": 0.00225830078125, "learning_rate": 5.859037474988874e-07, "loss": -0.0094, "num_tokens": 6992972.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5277666265221463, "frac_reward_zero_std": 1.0, "grad_norm": 0.012350945703977597, "kl": 0.0011348724365234375, "learning_rate": 5.854846983598016e-07, "loss": 0.0, "num_tokens": 6995992.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 352.5, "completions/mean_terminated_length": 352.5, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.5280342566572996, "frac_reward_zero_std": 1.0, "grad_norm": 0.014708851806052916, "kl": 0.00124359130859375, "learning_rate": 5.850656182545741e-07, "loss": 0.0001, "num_tokens": 6999756.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 306.125, "completions/mean_terminated_length": 306.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.5283018867924528, "frac_reward_zero_std": 0.5, "grad_norm": 0.8506830595370688, "kl": 0.00429534912109375, "learning_rate": 5.8464650754892e-07, "loss": -0.0048, "num_tokens": 7003633.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 304.625, "completions/mean_terminated_length": 304.625, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.528569516927606, "frac_reward_zero_std": 1.0, "grad_norm": 0.019756512475282092, "kl": 0.00139617919921875, "learning_rate": 5.842273666085816e-07, "loss": 0.0001, "num_tokens": 7007050.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 333.875, "completions/mean_terminated_length": 333.875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5288371470627593, "frac_reward_zero_std": 1.0, "grad_norm": 0.017597819119500743, "kl": 0.0022430419921875, "learning_rate": 5.838081957993268e-07, "loss": 0.0001, "num_tokens": 7010837.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5291047771979125, "frac_reward_zero_std": 1.0, "grad_norm": 0.013902173053089474, "kl": 0.00153350830078125, "learning_rate": 5.8338899548695e-07, "loss": 0.0001, "num_tokens": 7014013.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5293724073330657, "frac_reward_zero_std": 1.0, "grad_norm": 0.017682950278620074, "kl": 0.001453399658203125, "learning_rate": 5.829697660372713e-07, "loss": 0.0001, "num_tokens": 7017642.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.529640037468219, "frac_reward_zero_std": 1.0, "grad_norm": 0.051899094921312, "kl": 0.0046234130859375, "learning_rate": 5.825505078161363e-07, "loss": 0.0002, "num_tokens": 7020848.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.5299076676033722, "frac_reward_zero_std": 1.0, "grad_norm": 0.010833303186899162, "kl": 0.000728607177734375, "learning_rate": 5.821312211894158e-07, "loss": 0.0, "num_tokens": 7024305.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5301752977385253, "frac_reward_zero_std": 1.0, "grad_norm": 0.021379978795864805, "kl": 0.001953125, "learning_rate": 5.817119065230051e-07, "loss": 0.0001, "num_tokens": 7027615.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 331.375, "completions/mean_terminated_length": 331.375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5304429278736785, "frac_reward_zero_std": 1.0, "grad_norm": 0.019140384790702087, "kl": 0.001811981201171875, "learning_rate": 5.812925641828241e-07, "loss": 0.0001, "num_tokens": 7031210.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.5307105580088318, "frac_reward_zero_std": 1.0, "grad_norm": 0.017675594979771843, "kl": 0.0023040771484375, "learning_rate": 5.808731945348168e-07, "loss": 0.0001, "num_tokens": 7035000.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.530978188143985, "frac_reward_zero_std": 1.0, "grad_norm": 0.013086083725175371, "kl": 0.0008792877197265625, "learning_rate": 5.804537979449512e-07, "loss": 0.0, "num_tokens": 7037829.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.5312458182791382, "frac_reward_zero_std": 1.0, "grad_norm": 0.01892918848287925, "kl": 0.001255035400390625, "learning_rate": 5.800343747792186e-07, "loss": 0.0001, "num_tokens": 7040758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.5315134484142915, "frac_reward_zero_std": 1.0, "grad_norm": 0.06459077867151715, "kl": 0.00348663330078125, "learning_rate": 5.796149254036335e-07, "loss": 0.0001, "num_tokens": 7044263.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 368.375, "completions/mean_terminated_length": 368.375, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.5317810785494447, "frac_reward_zero_std": 1.0, "grad_norm": 0.015568410823857133, "kl": 0.000904083251953125, "learning_rate": 5.791954501842337e-07, "loss": 0.0, "num_tokens": 7048274.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.5320487086845979, "frac_reward_zero_std": 1.0, "grad_norm": 0.014283201386543189, "kl": 0.001445770263671875, "learning_rate": 5.787759494870789e-07, "loss": 0.0001, "num_tokens": 7050959.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.5323163388197512, "frac_reward_zero_std": 1.0, "grad_norm": 0.010523358939872234, "kl": 0.001094818115234375, "learning_rate": 5.783564236782513e-07, "loss": 0.0, "num_tokens": 7054914.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 245.0, "completions/mean_terminated_length": 245.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5325839689549043, "frac_reward_zero_std": 1.0, "grad_norm": 0.014526371242170646, "kl": 0.00141143798828125, "learning_rate": 5.779368731238553e-07, "loss": 0.0001, "num_tokens": 7057874.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5328515990900575, "frac_reward_zero_std": 0.5, "grad_norm": 0.8192389742448113, "kl": 0.002071380615234375, "learning_rate": 5.775172981900166e-07, "loss": 0.0284, "num_tokens": 7061815.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5331192292252107, "frac_reward_zero_std": 1.0, "grad_norm": 0.01776799947503429, "kl": 0.0009136199951171875, "learning_rate": 5.770976992428821e-07, "loss": 0.0, "num_tokens": 7064680.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.533386859360364, "frac_reward_zero_std": 1.0, "grad_norm": 0.011671449732824688, "kl": 0.001007080078125, "learning_rate": 5.7667807664862e-07, "loss": 0.0, "num_tokens": 7068437.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.5336544894955172, "frac_reward_zero_std": 1.0, "grad_norm": 0.019597160794631165, "kl": 0.001483917236328125, "learning_rate": 5.762584307734187e-07, "loss": 0.0001, "num_tokens": 7072687.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 376.75, "completions/mean_terminated_length": 376.75, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.5339221196306704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0159779813551514, "kl": 0.0011138916015625, "learning_rate": 5.758387619834872e-07, "loss": 0.0, "num_tokens": 7076813.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 302.125, "completions/mean_terminated_length": 302.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5341897497658237, "frac_reward_zero_std": 1.0, "grad_norm": 0.025474297327540453, "kl": 0.001483917236328125, "learning_rate": 5.754190706450543e-07, "loss": 0.0001, "num_tokens": 7080274.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 231.875, "completions/mean_terminated_length": 231.875, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5344573799009769, "frac_reward_zero_std": 1.0, "grad_norm": 0.01928853564635077, "kl": 0.001842498779296875, "learning_rate": 5.74999357124369e-07, "loss": 0.0001, "num_tokens": 7083073.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.53472501003613, "frac_reward_zero_std": 1.0, "grad_norm": 0.06942513742006684, "kl": 0.00321197509765625, "learning_rate": 5.745796217876989e-07, "loss": 0.0001, "num_tokens": 7085619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5349926401712833, "frac_reward_zero_std": 0.5, "grad_norm": 0.9132285460705964, "kl": 0.00289154052734375, "learning_rate": 5.74159865001331e-07, "loss": -0.0467, "num_tokens": 7088907.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5352602703064365, "frac_reward_zero_std": 1.0, "grad_norm": 0.024566367398515903, "kl": 0.0019378662109375, "learning_rate": 5.737400871315714e-07, "loss": 0.0001, "num_tokens": 7092890.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5355279004415897, "frac_reward_zero_std": 1.0, "grad_norm": 0.02794330781694572, "kl": 0.00257110595703125, "learning_rate": 5.733202885447439e-07, "loss": 0.0001, "num_tokens": 7096421.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5357955305767429, "frac_reward_zero_std": 0.5, "grad_norm": 1.90456302296165, "kl": 0.00579833984375, "learning_rate": 5.729004696071908e-07, "loss": -0.0242, "num_tokens": 7099965.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.5360631607118962, "frac_reward_zero_std": 1.0, "grad_norm": 0.02212881439311567, "kl": 0.0018310546875, "learning_rate": 5.72480630685272e-07, "loss": 0.0001, "num_tokens": 7103453.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5363307908470494, "frac_reward_zero_std": 1.0, "grad_norm": 0.08358136529610152, "kl": 0.005126953125, "learning_rate": 5.720607721453649e-07, "loss": 0.0002, "num_tokens": 7106687.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.5365984209822026, "frac_reward_zero_std": 1.0, "grad_norm": 0.05072555489592743, "kl": 0.00389862060546875, "learning_rate": 5.716408943538642e-07, "loss": 0.0002, "num_tokens": 7110055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5368660511173559, "frac_reward_zero_std": 1.0, "grad_norm": 0.05400019213366808, "kl": 0.00344085693359375, "learning_rate": 5.712209976771811e-07, "loss": 0.0001, "num_tokens": 7112949.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.537133681252509, "frac_reward_zero_std": 1.0, "grad_norm": 0.07142273690787661, "kl": 0.005096435546875, "learning_rate": 5.708010824817432e-07, "loss": 0.0002, "num_tokens": 7116594.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5374013113876622, "frac_reward_zero_std": 1.0, "grad_norm": 0.013155527965901583, "kl": 0.00113677978515625, "learning_rate": 5.703811491339947e-07, "loss": 0.0, "num_tokens": 7120514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5376689415228155, "frac_reward_zero_std": 1.0, "grad_norm": 0.007967713351583618, "kl": 0.000484466552734375, "learning_rate": 5.699611980003954e-07, "loss": 0.0, "num_tokens": 7123269.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 236.125, "completions/mean_terminated_length": 236.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.5379365716579687, "frac_reward_zero_std": 1.0, "grad_norm": 0.05935499329991724, "kl": 0.0034637451171875, "learning_rate": 5.695412294474207e-07, "loss": 0.0001, "num_tokens": 7126070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5382042017931219, "frac_reward_zero_std": 1.0, "grad_norm": 0.028476726464850364, "kl": 0.0017852783203125, "learning_rate": 5.691212438415608e-07, "loss": 0.0001, "num_tokens": 7129128.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 286.5, "completions/mean_terminated_length": 286.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.5384718319282751, "frac_reward_zero_std": 1.0, "grad_norm": 0.01421097013430564, "kl": 0.00159454345703125, "learning_rate": 5.687012415493213e-07, "loss": 0.0001, "num_tokens": 7132480.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5387394620634284, "frac_reward_zero_std": 1.0, "grad_norm": 0.07914656798421874, "kl": 0.00391387939453125, "learning_rate": 5.682812229372224e-07, "loss": 0.0002, "num_tokens": 7135483.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 610.0, "completions/max_terminated_length": 610.0, "completions/mean_length": 442.25, "completions/mean_terminated_length": 442.25, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.5390070921985816, "frac_reward_zero_std": 1.0, "grad_norm": 0.025315190788631245, "kl": 0.00276947021484375, "learning_rate": 5.678611883717978e-07, "loss": 0.0001, "num_tokens": 7140285.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5392747223337347, "frac_reward_zero_std": 1.0, "grad_norm": 0.02388402717186468, "kl": 0.001209259033203125, "learning_rate": 5.674411382195962e-07, "loss": 0.0, "num_tokens": 7143073.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.539542352468888, "frac_reward_zero_std": 1.0, "grad_norm": 0.040193548215314165, "kl": 0.002471923828125, "learning_rate": 5.670210728471789e-07, "loss": 0.0001, "num_tokens": 7146429.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5398099826040412, "frac_reward_zero_std": 0.5, "grad_norm": 1.0503834356099364, "kl": 0.00344085693359375, "learning_rate": 5.666009926211211e-07, "loss": 0.0025, "num_tokens": 7149751.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5400776127391944, "frac_reward_zero_std": 1.0, "grad_norm": 0.017468505034222, "kl": 0.001293182373046875, "learning_rate": 5.661808979080106e-07, "loss": 0.0001, "num_tokens": 7153182.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 398.375, "completions/mean_terminated_length": 398.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.5403452428743476, "frac_reward_zero_std": 1.0, "grad_norm": 0.013490279674168881, "kl": 0.00119781494140625, "learning_rate": 5.657607890744484e-07, "loss": 0.0, "num_tokens": 7157401.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5406128730095009, "frac_reward_zero_std": 1.0, "grad_norm": 0.021687321774051634, "kl": 0.001644134521484375, "learning_rate": 5.65340666487047e-07, "loss": 0.0001, "num_tokens": 7161219.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 346.5, "completions/mean_terminated_length": 346.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5408805031446541, "frac_reward_zero_std": 1.0, "grad_norm": 0.028349931121978676, "kl": 0.001735687255859375, "learning_rate": 5.649205305124316e-07, "loss": 0.0001, "num_tokens": 7165083.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5411481332798073, "frac_reward_zero_std": 1.0, "grad_norm": 0.049658070609964446, "kl": 0.00251007080078125, "learning_rate": 5.645003815172389e-07, "loss": 0.0001, "num_tokens": 7167641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 402.25, "completions/mean_terminated_length": 402.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5414157634149606, "frac_reward_zero_std": 0.5, "grad_norm": 0.6853483677929477, "kl": 0.0021820068359375, "learning_rate": 5.640802198681166e-07, "loss": -0.0196, "num_tokens": 7172095.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.5416833935501137, "frac_reward_zero_std": 1.0, "grad_norm": 0.012283164081385183, "kl": 0.001094818115234375, "learning_rate": 5.63660045931724e-07, "loss": 0.0, "num_tokens": 7176074.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 233.625, "completions/mean_terminated_length": 233.625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5419510236852669, "frac_reward_zero_std": 1.0, "grad_norm": 0.03999496990164486, "kl": 0.002349853515625, "learning_rate": 5.632398600747307e-07, "loss": 0.0001, "num_tokens": 7179207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 234.25, "completions/mean_terminated_length": 234.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5422186538204202, "frac_reward_zero_std": 1.0, "grad_norm": 0.015399467284768533, "kl": 0.00118255615234375, "learning_rate": 5.62819662663817e-07, "loss": 0.0, "num_tokens": 7181985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.5424862839555734, "frac_reward_zero_std": 0.5, "grad_norm": 1.443915751987066, "kl": 0.0024566650390625, "learning_rate": 5.623994540656729e-07, "loss": -0.0035, "num_tokens": 7185121.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 237.625, "completions/mean_terminated_length": 237.625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5427539140907266, "frac_reward_zero_std": 1.0, "grad_norm": 0.014530235902058335, "kl": 0.00104522705078125, "learning_rate": 5.619792346469987e-07, "loss": 0.0, "num_tokens": 7188318.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 355.25, "completions/mean_terminated_length": 355.25, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.5430215442258798, "frac_reward_zero_std": 1.0, "grad_norm": 0.012313082011472692, "kl": 0.001277923583984375, "learning_rate": 5.615590047745036e-07, "loss": 0.0001, "num_tokens": 7192236.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 272.5, "completions/mean_terminated_length": 272.5, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.5432891743610331, "frac_reward_zero_std": 1.0, "grad_norm": 0.009047383197169816, "kl": 0.0006561279296875, "learning_rate": 5.611387648149063e-07, "loss": 0.0, "num_tokens": 7195340.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5435568044961863, "frac_reward_zero_std": 1.0, "grad_norm": 0.02109053971711779, "kl": 0.001544952392578125, "learning_rate": 5.607185151349342e-07, "loss": 0.0001, "num_tokens": 7198031.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 388.625, "completions/mean_terminated_length": 388.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.5438244346313394, "frac_reward_zero_std": 0.5, "grad_norm": 0.821579292969987, "kl": 0.00229644775390625, "learning_rate": 5.602982561013231e-07, "loss": -0.0152, "num_tokens": 7202244.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5440920647664927, "frac_reward_zero_std": 1.0, "grad_norm": 0.011841874722292008, "kl": 0.0009212493896484375, "learning_rate": 5.598779880808169e-07, "loss": 0.0, "num_tokens": 7205281.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5443596949016459, "frac_reward_zero_std": 1.0, "grad_norm": 0.024603226319030693, "kl": 0.001506805419921875, "learning_rate": 5.594577114401677e-07, "loss": 0.0001, "num_tokens": 7208790.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 461.125, "completions/mean_terminated_length": 461.125, "completions/min_length": 369.0, "completions/min_terminated_length": 369.0, "epoch": 0.5446273250367991, "frac_reward_zero_std": 0.5, "grad_norm": 0.7140081841342383, "kl": 0.00263214111328125, "learning_rate": 5.590374265461346e-07, "loss": 0.0828, "num_tokens": 7213707.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.5448949551719524, "frac_reward_zero_std": 1.0, "grad_norm": 0.02084135299332178, "kl": 0.00226593017578125, "learning_rate": 5.586171337654845e-07, "loss": 0.0001, "num_tokens": 7217048.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5451625853071056, "frac_reward_zero_std": 1.0, "grad_norm": 0.032182540646999484, "kl": 0.001708984375, "learning_rate": 5.581968334649906e-07, "loss": 0.0001, "num_tokens": 7220893.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.5454302154422588, "frac_reward_zero_std": 1.0, "grad_norm": 0.02417660920724362, "kl": 0.0023040771484375, "learning_rate": 5.57776526011433e-07, "loss": 0.0001, "num_tokens": 7224484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.545697845577412, "frac_reward_zero_std": 1.0, "grad_norm": 0.018057158859816198, "kl": 0.0013332366943359375, "learning_rate": 5.573562117715981e-07, "loss": 0.0001, "num_tokens": 7227300.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5459654757125653, "frac_reward_zero_std": 1.0, "grad_norm": 0.029300019018823814, "kl": 0.00262451171875, "learning_rate": 5.569358911122779e-07, "loss": 0.0001, "num_tokens": 7230360.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.5462331058477184, "frac_reward_zero_std": 0.5, "grad_norm": 1.0159310146464093, "kl": 0.00281524658203125, "learning_rate": 5.565155644002704e-07, "loss": 0.0123, "num_tokens": 7234260.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5465007359828716, "frac_reward_zero_std": 0.5, "grad_norm": 0.7778451853610724, "kl": 0.0023956298828125, "learning_rate": 5.560952320023785e-07, "loss": -0.0093, "num_tokens": 7237483.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 295.25, "completions/mean_terminated_length": 295.25, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.5467683661180249, "frac_reward_zero_std": 1.0, "grad_norm": 0.02579339981973544, "kl": 0.002593994140625, "learning_rate": 5.556748942854102e-07, "loss": 0.0001, "num_tokens": 7240929.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 202.0, "completions/mean_terminated_length": 202.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.5470359962531781, "frac_reward_zero_std": 1.0, "grad_norm": 0.018168994739267717, "kl": 0.001270294189453125, "learning_rate": 5.552545516161786e-07, "loss": 0.0001, "num_tokens": 7243553.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5473036263883313, "frac_reward_zero_std": 1.0, "grad_norm": 0.013995769439948394, "kl": 0.000858306884765625, "learning_rate": 5.548342043615e-07, "loss": 0.0, "num_tokens": 7246285.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 323.25, "completions/mean_terminated_length": 323.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.5475712565234846, "frac_reward_zero_std": 1.0, "grad_norm": 0.013665026896723368, "kl": 0.00128936767578125, "learning_rate": 5.54413852888196e-07, "loss": 0.0001, "num_tokens": 7250331.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5478388866586378, "frac_reward_zero_std": 1.0, "grad_norm": 0.02038170687105278, "kl": 0.0014495849609375, "learning_rate": 5.539934975630909e-07, "loss": 0.0001, "num_tokens": 7253361.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 340.875, "completions/mean_terminated_length": 340.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.548106516793791, "frac_reward_zero_std": 1.0, "grad_norm": 0.00867975675680259, "kl": 0.0005931854248046875, "learning_rate": 5.535731387530132e-07, "loss": 0.0, "num_tokens": 7257220.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 606.0, "completions/max_terminated_length": 606.0, "completions/mean_length": 337.25, "completions/mean_terminated_length": 337.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5483741469289442, "frac_reward_zero_std": 1.0, "grad_norm": 0.05465821754296927, "kl": 0.0050048828125, "learning_rate": 5.531527768247934e-07, "loss": 0.0002, "num_tokens": 7261002.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 354.5, "completions/mean_terminated_length": 354.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.5486417770640974, "frac_reward_zero_std": 1.0, "grad_norm": 0.01768393784661969, "kl": 0.002288818359375, "learning_rate": 5.527324121452658e-07, "loss": 0.0001, "num_tokens": 7265086.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.5489094071992506, "frac_reward_zero_std": 1.0, "grad_norm": 0.028265792134631502, "kl": 0.0015163421630859375, "learning_rate": 5.523120450812665e-07, "loss": 0.0001, "num_tokens": 7269245.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5491770373344038, "frac_reward_zero_std": 1.0, "grad_norm": 0.0206033907410237, "kl": 0.0014667510986328125, "learning_rate": 5.518916759996336e-07, "loss": 0.0001, "num_tokens": 7272595.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.5494446674695571, "frac_reward_zero_std": 1.0, "grad_norm": 0.030759842586101555, "kl": 0.00235748291015625, "learning_rate": 5.514713052672072e-07, "loss": 0.0001, "num_tokens": 7276348.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.5497122976047103, "frac_reward_zero_std": 0.5, "grad_norm": 0.8468460297624372, "kl": 0.0019989013671875, "learning_rate": 5.51050933250829e-07, "loss": 0.0001, "num_tokens": 7280449.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5499799277398635, "frac_reward_zero_std": 1.0, "grad_norm": 0.023910566436255703, "kl": 0.003326416015625, "learning_rate": 5.506305603173414e-07, "loss": 0.0001, "num_tokens": 7283776.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 205.5, "completions/mean_terminated_length": 205.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.5502475578750168, "frac_reward_zero_std": 1.0, "grad_norm": 0.02424649681769239, "kl": 0.001300811767578125, "learning_rate": 5.502101868335878e-07, "loss": 0.0001, "num_tokens": 7286484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.55051518801017, "frac_reward_zero_std": 0.5, "grad_norm": 1.0018519900403524, "kl": 0.0082244873046875, "learning_rate": 5.497898131664122e-07, "loss": 0.0165, "num_tokens": 7290143.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 397.5, "completions/mean_terminated_length": 397.5, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.5507828181453231, "frac_reward_zero_std": 1.0, "grad_norm": 0.015843986758342726, "kl": 0.0012187957763671875, "learning_rate": 5.493694396826588e-07, "loss": 0.0, "num_tokens": 7294371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.5510504482804763, "frac_reward_zero_std": 1.0, "grad_norm": 0.02603522517212546, "kl": 0.0019989013671875, "learning_rate": 5.48949066749171e-07, "loss": 0.0001, "num_tokens": 7297852.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.5513180784156296, "frac_reward_zero_std": 1.0, "grad_norm": 0.018550637052565454, "kl": 0.001861572265625, "learning_rate": 5.485286947327929e-07, "loss": 0.0001, "num_tokens": 7301228.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 221.125, "completions/mean_terminated_length": 221.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5515857085507828, "frac_reward_zero_std": 1.0, "grad_norm": 0.035661119251617655, "kl": 0.00212860107421875, "learning_rate": 5.481083240003665e-07, "loss": 0.0001, "num_tokens": 7304025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 337.125, "completions/mean_terminated_length": 337.125, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.551853338685936, "frac_reward_zero_std": 1.0, "grad_norm": 0.02027927320307509, "kl": 0.001766204833984375, "learning_rate": 5.476879549187335e-07, "loss": 0.0001, "num_tokens": 7307814.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5521209688210893, "frac_reward_zero_std": 0.5, "grad_norm": 0.8764269282497804, "kl": 0.002044677734375, "learning_rate": 5.472675878547343e-07, "loss": 0.0118, "num_tokens": 7311036.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 431.0, "completions/mean_terminated_length": 431.0, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.5523885989562425, "frac_reward_zero_std": 1.0, "grad_norm": 0.014047472447345662, "kl": 0.001800537109375, "learning_rate": 5.468472231752065e-07, "loss": 0.0001, "num_tokens": 7315548.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 291.75, "completions/mean_terminated_length": 291.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5526562290913957, "frac_reward_zero_std": 1.0, "grad_norm": 0.013056968889780328, "kl": 0.0009937286376953125, "learning_rate": 5.464268612469869e-07, "loss": 0.0, "num_tokens": 7319010.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 241.5, "completions/mean_terminated_length": 241.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.552923859226549, "frac_reward_zero_std": 1.0, "grad_norm": 0.0179535334700199, "kl": 0.001651763916015625, "learning_rate": 5.460065024369089e-07, "loss": 0.0001, "num_tokens": 7322142.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5531914893617021, "frac_reward_zero_std": 1.0, "grad_norm": 0.032738530492872876, "kl": 0.00299835205078125, "learning_rate": 5.455861471118041e-07, "loss": 0.0001, "num_tokens": 7325769.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 314.875, "completions/mean_terminated_length": 314.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5534591194968553, "frac_reward_zero_std": 1.0, "grad_norm": 0.028227008098353525, "kl": 0.0027313232421875, "learning_rate": 5.451657956385001e-07, "loss": 0.0001, "num_tokens": 7329324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5537267496320085, "frac_reward_zero_std": 1.0, "grad_norm": 0.013091936980419834, "kl": 0.001522064208984375, "learning_rate": 5.447454483838216e-07, "loss": 0.0001, "num_tokens": 7333229.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 322.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.5539943797671618, "frac_reward_zero_std": 1.0, "grad_norm": 0.03137701543079504, "kl": 0.001708984375, "learning_rate": 5.443251057145897e-07, "loss": 0.0001, "num_tokens": 7336925.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 291.625, "completions/mean_terminated_length": 291.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.554262009902315, "frac_reward_zero_std": 1.0, "grad_norm": 0.02352853173336252, "kl": 0.001857757568359375, "learning_rate": 5.439047679976217e-07, "loss": 0.0001, "num_tokens": 7340274.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 432.375, "completions/mean_terminated_length": 432.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5545296400374682, "frac_reward_zero_std": 1.0, "grad_norm": 0.038033092718971456, "kl": 0.00301361083984375, "learning_rate": 5.434844355997296e-07, "loss": 0.0001, "num_tokens": 7344765.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 236.875, "completions/mean_terminated_length": 236.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5547972701726215, "frac_reward_zero_std": 1.0, "grad_norm": 0.028227799289795, "kl": 0.00189971923828125, "learning_rate": 5.43064108887722e-07, "loss": 0.0001, "num_tokens": 7347808.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5550649003077747, "frac_reward_zero_std": 0.5, "grad_norm": 1.0542171804712863, "kl": 0.002899169921875, "learning_rate": 5.426437882284019e-07, "loss": -0.0469, "num_tokens": 7352201.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 320.875, "completions/mean_terminated_length": 320.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5553325304429279, "frac_reward_zero_std": 1.0, "grad_norm": 0.021295719576899675, "kl": 0.00231170654296875, "learning_rate": 5.422234739885671e-07, "loss": 0.0001, "num_tokens": 7355748.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.555600160578081, "frac_reward_zero_std": 1.0, "grad_norm": 0.010504659121777022, "kl": 0.0012664794921875, "learning_rate": 5.418031665350096e-07, "loss": 0.0001, "num_tokens": 7359219.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 307.125, "completions/mean_terminated_length": 307.125, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5558677907132343, "frac_reward_zero_std": 1.0, "grad_norm": 0.013494483968330183, "kl": 0.00153350830078125, "learning_rate": 5.413828662345156e-07, "loss": 0.0001, "num_tokens": 7362768.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 308.625, "completions/mean_terminated_length": 308.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5561354208483875, "frac_reward_zero_std": 1.0, "grad_norm": 0.019906330283173058, "kl": 0.0020904541015625, "learning_rate": 5.409625734538654e-07, "loss": 0.0001, "num_tokens": 7366237.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5564030509835407, "frac_reward_zero_std": 1.0, "grad_norm": 0.025405638676469472, "kl": 0.001941680908203125, "learning_rate": 5.405422885598323e-07, "loss": 0.0001, "num_tokens": 7369483.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.556670681118694, "frac_reward_zero_std": 0.5, "grad_norm": 0.8324594086376372, "kl": 0.00099945068359375, "learning_rate": 5.401220119191831e-07, "loss": 0.0, "num_tokens": 7372969.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.5569383112538472, "frac_reward_zero_std": 0.5, "grad_norm": 0.906193159924491, "kl": 0.0020294189453125, "learning_rate": 5.39701743898677e-07, "loss": 0.0115, "num_tokens": 7377016.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5572059413890004, "frac_reward_zero_std": 1.0, "grad_norm": 0.020528084792553893, "kl": 0.001346588134765625, "learning_rate": 5.392814848650658e-07, "loss": 0.0001, "num_tokens": 7379643.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 345.25, "completions/mean_terminated_length": 345.25, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5574735715241537, "frac_reward_zero_std": 0.5, "grad_norm": 0.5346546704386473, "kl": 0.0008068084716796875, "learning_rate": 5.388612351850938e-07, "loss": 0.0, "num_tokens": 7383485.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5577412016593069, "frac_reward_zero_std": 1.0, "grad_norm": 0.021950528107515153, "kl": 0.0014801025390625, "learning_rate": 5.384409952254963e-07, "loss": 0.0001, "num_tokens": 7386594.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 317.125, "completions/mean_terminated_length": 317.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.55800883179446, "frac_reward_zero_std": 1.0, "grad_norm": 0.03916079215163071, "kl": 0.001583099365234375, "learning_rate": 5.380207653530013e-07, "loss": 0.0001, "num_tokens": 7390131.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5582764619296132, "frac_reward_zero_std": 0.5, "grad_norm": 0.9031340779661031, "kl": 0.001708984375, "learning_rate": 5.376005459343271e-07, "loss": -0.0158, "num_tokens": 7393696.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 247.25, "completions/mean_terminated_length": 247.25, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5585440920647665, "frac_reward_zero_std": 1.0, "grad_norm": 0.015491657385608757, "kl": 0.000881195068359375, "learning_rate": 5.371803373361832e-07, "loss": 0.0, "num_tokens": 7396738.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5588117221999197, "frac_reward_zero_std": 1.0, "grad_norm": 0.01555945370230914, "kl": 0.001628875732421875, "learning_rate": 5.367601399252694e-07, "loss": 0.0001, "num_tokens": 7400781.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 271.0, "completions/mean_terminated_length": 271.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.5590793523350729, "frac_reward_zero_std": 1.0, "grad_norm": 0.014681758914338884, "kl": 0.0011138916015625, "learning_rate": 5.363399540682762e-07, "loss": 0.0, "num_tokens": 7403889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.5593469824702262, "frac_reward_zero_std": 1.0, "grad_norm": 0.036808491141448185, "kl": 0.002349853515625, "learning_rate": 5.359197801318834e-07, "loss": 0.0001, "num_tokens": 7407424.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.5596146126053794, "frac_reward_zero_std": 1.0, "grad_norm": 0.10973114785464487, "kl": 0.0077972412109375, "learning_rate": 5.354996184827611e-07, "loss": 0.0003, "num_tokens": 7410525.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 530.375, "completions/mean_terminated_length": 530.375, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.5598822427405326, "frac_reward_zero_std": 0.5, "grad_norm": 0.5512809082830519, "kl": 0.0014190673828125, "learning_rate": 5.350794694875684e-07, "loss": 0.0259, "num_tokens": 7416304.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5601498728756859, "frac_reward_zero_std": 1.0, "grad_norm": 0.01434657744309616, "kl": 0.0011749267578125, "learning_rate": 5.34659333512953e-07, "loss": 0.0, "num_tokens": 7419485.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.560417503010839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0184857544453694, "kl": 0.00135040283203125, "learning_rate": 5.342392109255518e-07, "loss": 0.0001, "num_tokens": 7422735.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5606851331459922, "frac_reward_zero_std": 1.0, "grad_norm": 0.020264845675713128, "kl": 0.001941680908203125, "learning_rate": 5.338191020919893e-07, "loss": 0.0001, "num_tokens": 7425758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5609527632811454, "frac_reward_zero_std": 1.0, "grad_norm": 0.012953962643041191, "kl": 0.00148773193359375, "learning_rate": 5.333990073788792e-07, "loss": 0.0001, "num_tokens": 7429574.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 426.0, "completions/mean_terminated_length": 426.0, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.5612203934162987, "frac_reward_zero_std": 0.5, "grad_norm": 0.44646247450412585, "kl": 0.0004940032958984375, "learning_rate": 5.329789271528211e-07, "loss": 0.0, "num_tokens": 7433978.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 464.5, "completions/mean_terminated_length": 464.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.5614880235514519, "frac_reward_zero_std": 1.0, "grad_norm": 0.009591344001932237, "kl": 0.001026153564453125, "learning_rate": 5.325588617804039e-07, "loss": 0.0, "num_tokens": 7438894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5617556536866051, "frac_reward_zero_std": 0.5, "grad_norm": 0.6413312600251104, "kl": 0.0009613037109375, "learning_rate": 5.321388116282022e-07, "loss": 0.0054, "num_tokens": 7442521.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 302.625, "completions/mean_terminated_length": 302.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.5620232838217584, "frac_reward_zero_std": 1.0, "grad_norm": 0.018872544458366315, "kl": 0.001598358154296875, "learning_rate": 5.317187770627778e-07, "loss": 0.0001, "num_tokens": 7445882.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5622909139569116, "frac_reward_zero_std": 1.0, "grad_norm": 0.02269726776216952, "kl": 0.00213623046875, "learning_rate": 5.312987584506786e-07, "loss": 0.0001, "num_tokens": 7448458.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 282.375, "completions/mean_terminated_length": 282.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5625585440920647, "frac_reward_zero_std": 1.0, "grad_norm": 0.013683844319898945, "kl": 0.001129150390625, "learning_rate": 5.308787561584391e-07, "loss": 0.0, "num_tokens": 7451653.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 319.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.562826174227218, "frac_reward_zero_std": 0.5, "grad_norm": 0.9211578607493038, "kl": 0.00153350830078125, "learning_rate": 5.304587705525794e-07, "loss": -0.0206, "num_tokens": 7455388.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 493.0, "completions/mean_terminated_length": 493.0, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.5630938043623712, "frac_reward_zero_std": 0.5, "grad_norm": 0.7054555173742522, "kl": 0.00274658203125, "learning_rate": 5.300388019996045e-07, "loss": -0.0432, "num_tokens": 7460772.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 226.25, "completions/mean_terminated_length": 226.25, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.5633614344975244, "frac_reward_zero_std": 1.0, "grad_norm": 0.031009713150265086, "kl": 0.00331878662109375, "learning_rate": 5.296188508660053e-07, "loss": 0.0001, "num_tokens": 7463566.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5636290646326776, "frac_reward_zero_std": 1.0, "grad_norm": 0.028169399673837744, "kl": 0.00189208984375, "learning_rate": 5.291989175182568e-07, "loss": 0.0001, "num_tokens": 7466092.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 281.625, "completions/mean_terminated_length": 281.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.5638966947678309, "frac_reward_zero_std": 1.0, "grad_norm": 0.05813379266522341, "kl": 0.0027923583984375, "learning_rate": 5.287790023228192e-07, "loss": 0.0001, "num_tokens": 7469501.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5641643249029841, "frac_reward_zero_std": 1.0, "grad_norm": 0.035450926119608595, "kl": 0.00217437744140625, "learning_rate": 5.283591056461359e-07, "loss": 0.0001, "num_tokens": 7473309.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5644319550381373, "frac_reward_zero_std": 1.0, "grad_norm": 0.016231430459383626, "kl": 0.001708984375, "learning_rate": 5.279392278546351e-07, "loss": 0.0001, "num_tokens": 7476698.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5646995851732906, "frac_reward_zero_std": 1.0, "grad_norm": 0.015416120811385946, "kl": 0.001861572265625, "learning_rate": 5.275193693147281e-07, "loss": 0.0001, "num_tokens": 7480347.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 358.25, "completions/mean_terminated_length": 358.25, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.5649672153084437, "frac_reward_zero_std": 1.0, "grad_norm": 0.027744568815098902, "kl": 0.0013751983642578125, "learning_rate": 5.270995303928092e-07, "loss": 0.0001, "num_tokens": 7484277.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5652348454435969, "frac_reward_zero_std": 1.0, "grad_norm": 0.017863268797436438, "kl": 0.001026153564453125, "learning_rate": 5.266797114552561e-07, "loss": 0.0, "num_tokens": 7487836.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.5655024755787502, "frac_reward_zero_std": 1.0, "grad_norm": 0.03852004764639346, "kl": 0.00213623046875, "learning_rate": 5.262599128684286e-07, "loss": 0.0001, "num_tokens": 7491368.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 329.75, "completions/mean_terminated_length": 329.75, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.5657701057139034, "frac_reward_zero_std": 1.0, "grad_norm": 0.012417752547325312, "kl": 0.001171112060546875, "learning_rate": 5.25840134998669e-07, "loss": 0.0, "num_tokens": 7495270.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 758.0, "completions/max_terminated_length": 758.0, "completions/mean_length": 410.5, "completions/mean_terminated_length": 410.5, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.5660377358490566, "frac_reward_zero_std": 1.0, "grad_norm": 0.012675376288263627, "kl": 0.001255035400390625, "learning_rate": 5.254203782123012e-07, "loss": 0.0001, "num_tokens": 7499594.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 217.75, "completions/mean_terminated_length": 217.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5663053659842098, "frac_reward_zero_std": 1.0, "grad_norm": 0.014425508168760459, "kl": 0.0009059906005859375, "learning_rate": 5.250006428756312e-07, "loss": 0.0, "num_tokens": 7502396.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 313.25, "completions/mean_terminated_length": 313.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5665729961193631, "frac_reward_zero_std": 0.5, "grad_norm": 0.8881034851167416, "kl": 0.0017547607421875, "learning_rate": 5.245809293549456e-07, "loss": 0.0001, "num_tokens": 7505790.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5668406262545163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0191311105891598, "kl": 0.00189208984375, "learning_rate": 5.24161238016513e-07, "loss": 0.0001, "num_tokens": 7508956.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 364.625, "completions/mean_terminated_length": 364.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5671082563896694, "frac_reward_zero_std": 0.5, "grad_norm": 0.5946873810129523, "kl": 0.002056121826171875, "learning_rate": 5.237415692265814e-07, "loss": 0.0465, "num_tokens": 7512885.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 351.25, "completions/mean_terminated_length": 351.25, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5673758865248227, "frac_reward_zero_std": 1.0, "grad_norm": 0.03569460714460151, "kl": 0.0034503936767578125, "learning_rate": 5.233219233513801e-07, "loss": 0.0001, "num_tokens": 7516723.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 691.0, "completions/max_terminated_length": 691.0, "completions/mean_length": 397.375, "completions/mean_terminated_length": 397.375, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5676435166599759, "frac_reward_zero_std": 1.0, "grad_norm": 0.01786302660038867, "kl": 0.0018463134765625, "learning_rate": 5.229023007571178e-07, "loss": 0.0001, "num_tokens": 7520898.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.5679111467951291, "frac_reward_zero_std": 1.0, "grad_norm": 0.026430610601872878, "kl": 0.00199127197265625, "learning_rate": 5.224827018099834e-07, "loss": 0.0001, "num_tokens": 7524468.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 307.125, "completions/mean_terminated_length": 307.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.5681787769302824, "frac_reward_zero_std": 1.0, "grad_norm": 0.013265977266955475, "kl": 0.00130462646484375, "learning_rate": 5.220631268761447e-07, "loss": 0.0001, "num_tokens": 7527989.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.5684464070654356, "frac_reward_zero_std": 1.0, "grad_norm": 0.018610678841658942, "kl": 0.0018463134765625, "learning_rate": 5.216435763217486e-07, "loss": 0.0001, "num_tokens": 7531344.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 337.25, "completions/mean_terminated_length": 337.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5687140372005888, "frac_reward_zero_std": 1.0, "grad_norm": 0.019293186746616734, "kl": 0.0017852783203125, "learning_rate": 5.212240505129212e-07, "loss": 0.0001, "num_tokens": 7535214.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.568981667335742, "frac_reward_zero_std": 1.0, "grad_norm": 0.013479525396675022, "kl": 0.001026153564453125, "learning_rate": 5.208045498157663e-07, "loss": 0.0, "num_tokens": 7537898.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 219.5, "completions/mean_terminated_length": 219.5, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.5692492974708953, "frac_reward_zero_std": 1.0, "grad_norm": 0.017544851465434403, "kl": 0.00128173828125, "learning_rate": 5.203850745963665e-07, "loss": 0.0001, "num_tokens": 7540666.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.5695169276060484, "frac_reward_zero_std": 1.0, "grad_norm": 0.011318033595531952, "kl": 0.00079345703125, "learning_rate": 5.199656252207815e-07, "loss": 0.0, "num_tokens": 7544314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5697845577412016, "frac_reward_zero_std": 1.0, "grad_norm": 0.023641682158003777, "kl": 0.00196075439453125, "learning_rate": 5.195462020550489e-07, "loss": 0.0001, "num_tokens": 7547490.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 399.875, "completions/mean_terminated_length": 399.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5700521878763549, "frac_reward_zero_std": 1.0, "grad_norm": 0.01480731383668205, "kl": 0.00159454345703125, "learning_rate": 5.191268054651832e-07, "loss": 0.0001, "num_tokens": 7551797.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 331.875, "completions/mean_terminated_length": 331.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5703198180115081, "frac_reward_zero_std": 1.0, "grad_norm": 0.02668698488906455, "kl": 0.0019683837890625, "learning_rate": 5.187074358171759e-07, "loss": 0.0001, "num_tokens": 7555400.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.5705874481466613, "frac_reward_zero_std": 1.0, "grad_norm": 0.017352825660420748, "kl": 0.00164031982421875, "learning_rate": 5.182880934769948e-07, "loss": 0.0001, "num_tokens": 7558801.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5708550782818146, "frac_reward_zero_std": 0.5, "grad_norm": 1.033644771653551, "kl": 0.00200653076171875, "learning_rate": 5.178687788105841e-07, "loss": 0.0521, "num_tokens": 7561881.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 367.25, "completions/mean_terminated_length": 367.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5711227084169678, "frac_reward_zero_std": 1.0, "grad_norm": 0.013712190662772088, "kl": 0.001583099365234375, "learning_rate": 5.174494921838637e-07, "loss": 0.0001, "num_tokens": 7565867.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 314.75, "completions/mean_terminated_length": 314.75, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.571390338552121, "frac_reward_zero_std": 1.0, "grad_norm": 0.03453641001637699, "kl": 0.0017547607421875, "learning_rate": 5.170302339627287e-07, "loss": 0.0001, "num_tokens": 7569389.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5716579686872741, "frac_reward_zero_std": 1.0, "grad_norm": 0.014637323207064519, "kl": 0.001438140869140625, "learning_rate": 5.166110045130501e-07, "loss": 0.0001, "num_tokens": 7572140.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5719255988224274, "frac_reward_zero_std": 1.0, "grad_norm": 0.011526028682063835, "kl": 0.001049041748046875, "learning_rate": 5.161918042006733e-07, "loss": 0.0, "num_tokens": 7575058.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5721932289575806, "frac_reward_zero_std": 1.0, "grad_norm": 0.018583604693291306, "kl": 0.00136566162109375, "learning_rate": 5.157726333914185e-07, "loss": 0.0001, "num_tokens": 7578314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5724608590927338, "frac_reward_zero_std": 1.0, "grad_norm": 0.014305593517460271, "kl": 0.001575469970703125, "learning_rate": 5.153534924510798e-07, "loss": 0.0001, "num_tokens": 7581930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 240.875, "completions/mean_terminated_length": 240.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.5727284892278871, "frac_reward_zero_std": 1.0, "grad_norm": 0.01580617416405494, "kl": 0.001251220703125, "learning_rate": 5.149343817454258e-07, "loss": 0.0001, "num_tokens": 7585037.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5729961193630403, "frac_reward_zero_std": 1.0, "grad_norm": 0.01613381239943204, "kl": 0.00151824951171875, "learning_rate": 5.145153016401984e-07, "loss": 0.0001, "num_tokens": 7588673.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5732637494981935, "frac_reward_zero_std": 1.0, "grad_norm": 0.02321639688602887, "kl": 0.0015411376953125, "learning_rate": 5.140962525011126e-07, "loss": 0.0001, "num_tokens": 7591949.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5735313796333467, "frac_reward_zero_std": 1.0, "grad_norm": 0.01719854196180053, "kl": 0.002208709716796875, "learning_rate": 5.136772346938568e-07, "loss": 0.0001, "num_tokens": 7595959.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.5737990097685, "frac_reward_zero_std": 1.0, "grad_norm": 0.04582024061284506, "kl": 0.0020599365234375, "learning_rate": 5.132582485840917e-07, "loss": 0.0001, "num_tokens": 7599144.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.5740666399036531, "frac_reward_zero_std": 1.0, "grad_norm": 0.012948097603268347, "kl": 0.000667572021484375, "learning_rate": 5.128392945374505e-07, "loss": 0.0, "num_tokens": 7601881.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 272.0, "completions/max_terminated_length": 272.0, "completions/mean_length": 249.5, "completions/mean_terminated_length": 249.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5743342700388063, "frac_reward_zero_std": 1.0, "grad_norm": 0.014355951153329247, "kl": 0.00115203857421875, "learning_rate": 5.124203729195384e-07, "loss": 0.0, "num_tokens": 7604889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.5746019001739596, "frac_reward_zero_std": 1.0, "grad_norm": 0.029855403888888337, "kl": 0.00228118896484375, "learning_rate": 5.120014840959324e-07, "loss": 0.0001, "num_tokens": 7608305.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.5748695303091128, "frac_reward_zero_std": 1.0, "grad_norm": 0.018747389927047924, "kl": 0.001514434814453125, "learning_rate": 5.115826284321807e-07, "loss": 0.0001, "num_tokens": 7612273.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.575137160444266, "frac_reward_zero_std": 1.0, "grad_norm": 0.020943227796517503, "kl": 0.0013275146484375, "learning_rate": 5.111638062938029e-07, "loss": 0.0001, "num_tokens": 7615897.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 198.625, "completions/mean_terminated_length": 198.625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.5754047905794193, "frac_reward_zero_std": 1.0, "grad_norm": 0.03623028028557143, "kl": 0.001773834228515625, "learning_rate": 5.107450180462884e-07, "loss": 0.0001, "num_tokens": 7618422.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 356.375, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.5756724207145725, "frac_reward_zero_std": 1.0, "grad_norm": 0.01140976169582109, "kl": 0.0009918212890625, "learning_rate": 5.103262640550986e-07, "loss": 0.0, "num_tokens": 7622417.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 251.75, "completions/mean_terminated_length": 251.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.5759400508497257, "frac_reward_zero_std": 1.0, "grad_norm": 0.015439856363971145, "kl": 0.000850677490234375, "learning_rate": 5.099075446856636e-07, "loss": 0.0, "num_tokens": 7625435.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5762076809848788, "frac_reward_zero_std": 1.0, "grad_norm": 0.022593198849967543, "kl": 0.00189971923828125, "learning_rate": 5.094888603033838e-07, "loss": 0.0001, "num_tokens": 7628696.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 189.125, "completions/mean_terminated_length": 189.125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.5764753111200321, "frac_reward_zero_std": 1.0, "grad_norm": 0.010599206808495479, "kl": 0.0006084442138671875, "learning_rate": 5.090702112736294e-07, "loss": 0.0, "num_tokens": 7631077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.5767429412551853, "frac_reward_zero_std": 1.0, "grad_norm": 0.020385133780037732, "kl": 0.00234222412109375, "learning_rate": 5.086515979617391e-07, "loss": 0.0001, "num_tokens": 7634773.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 402.375, "completions/mean_terminated_length": 402.375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.5770105713903385, "frac_reward_zero_std": 0.5, "grad_norm": 0.6438826270246859, "kl": 0.001617431640625, "learning_rate": 5.082330207330209e-07, "loss": 0.0079, "num_tokens": 7639028.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5772782015254918, "frac_reward_zero_std": 1.0, "grad_norm": 0.012713647597083762, "kl": 0.00087738037109375, "learning_rate": 5.078144799527512e-07, "loss": 0.0, "num_tokens": 7642449.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.577545831660645, "frac_reward_zero_std": 1.0, "grad_norm": 0.13076269917550637, "kl": 0.00624847412109375, "learning_rate": 5.073959759861745e-07, "loss": 0.0003, "num_tokens": 7645363.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5778134617957982, "frac_reward_zero_std": 1.0, "grad_norm": 0.011360994221517871, "kl": 0.0007305145263671875, "learning_rate": 5.069775091985031e-07, "loss": 0.0, "num_tokens": 7648412.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.5780810919309515, "frac_reward_zero_std": 1.0, "grad_norm": 0.01152201692076831, "kl": 0.0007877349853515625, "learning_rate": 5.065590799549172e-07, "loss": 0.0, "num_tokens": 7651923.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5783487220661047, "frac_reward_zero_std": 1.0, "grad_norm": 0.15974286947242947, "kl": 0.003265380859375, "learning_rate": 5.061406886205641e-07, "loss": 0.0001, "num_tokens": 7654968.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.5786163522012578, "frac_reward_zero_std": 1.0, "grad_norm": 0.023340453881526183, "kl": 0.0011949539184570312, "learning_rate": 5.057223355605577e-07, "loss": 0.0, "num_tokens": 7658209.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.578883982336411, "frac_reward_zero_std": 1.0, "grad_norm": 0.020073980039739616, "kl": 0.00191497802734375, "learning_rate": 5.053040211399792e-07, "loss": 0.0001, "num_tokens": 7661898.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.5791516124715643, "frac_reward_zero_std": 1.0, "grad_norm": 0.021644828729285068, "kl": 0.002410888671875, "learning_rate": 5.048857457238748e-07, "loss": 0.0001, "num_tokens": 7665075.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 639.0, "completions/max_terminated_length": 639.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5794192426067175, "frac_reward_zero_std": 1.0, "grad_norm": 0.0151869008161279, "kl": 0.0010833740234375, "learning_rate": 5.044675096772583e-07, "loss": 0.0, "num_tokens": 7668611.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 313.5, "completions/mean_terminated_length": 313.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5796868727418707, "frac_reward_zero_std": 1.0, "grad_norm": 0.01636005987375576, "kl": 0.001453399658203125, "learning_rate": 5.040493133651078e-07, "loss": 0.0001, "num_tokens": 7672191.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 363.5, "completions/mean_terminated_length": 363.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.579954502877024, "frac_reward_zero_std": 1.0, "grad_norm": 0.016822509876734497, "kl": 0.00164794921875, "learning_rate": 5.036311571523675e-07, "loss": 0.0001, "num_tokens": 7676151.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.5802221330121772, "frac_reward_zero_std": 1.0, "grad_norm": 0.02508473909640181, "kl": 0.0012054443359375, "learning_rate": 5.032130414039466e-07, "loss": 0.0, "num_tokens": 7679191.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.5804897631473304, "frac_reward_zero_std": 1.0, "grad_norm": 0.019741320693404892, "kl": 0.0014495849609375, "learning_rate": 5.027949664847182e-07, "loss": 0.0001, "num_tokens": 7682214.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 347.625, "completions/mean_terminated_length": 347.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.5807573932824837, "frac_reward_zero_std": 1.0, "grad_norm": 0.01582377721077564, "kl": 0.00173187255859375, "learning_rate": 5.023769327595208e-07, "loss": 0.0001, "num_tokens": 7686099.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.5810250234176368, "frac_reward_zero_std": 1.0, "grad_norm": 0.022852326722199487, "kl": 0.001430511474609375, "learning_rate": 5.01958940593156e-07, "loss": 0.0001, "num_tokens": 7689618.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 441.0, "completions/mean_terminated_length": 441.0, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.58129265355279, "frac_reward_zero_std": 0.5, "grad_norm": 0.6746935405989378, "kl": 0.001262664794921875, "learning_rate": 5.015409903503902e-07, "loss": -0.041, "num_tokens": 7694538.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 400.875, "completions/mean_terminated_length": 400.875, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.5815602836879432, "frac_reward_zero_std": 1.0, "grad_norm": 0.020773383409757703, "kl": 0.00228118896484375, "learning_rate": 5.011230823959522e-07, "loss": 0.0001, "num_tokens": 7698845.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 343.875, "completions/mean_terminated_length": 343.875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.5818279138230965, "frac_reward_zero_std": 1.0, "grad_norm": 0.01637420420670064, "kl": 0.0015869140625, "learning_rate": 5.007052170945344e-07, "loss": 0.0001, "num_tokens": 7702908.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.5820955439582497, "frac_reward_zero_std": 1.0, "grad_norm": 0.020327051720630894, "kl": 0.00136566162109375, "learning_rate": 5.002873948107919e-07, "loss": 0.0001, "num_tokens": 7706625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.5823631740934029, "frac_reward_zero_std": 1.0, "grad_norm": 0.009583543958481624, "kl": 0.00086212158203125, "learning_rate": 4.998696159093427e-07, "loss": 0.0, "num_tokens": 7709865.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.5826308042285562, "frac_reward_zero_std": 1.0, "grad_norm": 0.02161391351618164, "kl": 0.002044677734375, "learning_rate": 4.994518807547655e-07, "loss": 0.0001, "num_tokens": 7713422.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 233.375, "completions/mean_terminated_length": 233.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.5828984343637094, "frac_reward_zero_std": 1.0, "grad_norm": 0.009435964400910483, "kl": 0.0005130767822265625, "learning_rate": 4.990341897116027e-07, "loss": 0.0, "num_tokens": 7716329.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.5831660644988625, "frac_reward_zero_std": 1.0, "grad_norm": 0.016506448560787333, "kl": 0.00220489501953125, "learning_rate": 4.986165431443569e-07, "loss": 0.0001, "num_tokens": 7719901.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.5834336946340158, "frac_reward_zero_std": 1.0, "grad_norm": 0.012438205072418804, "kl": 0.001140594482421875, "learning_rate": 4.981989414174926e-07, "loss": 0.0, "num_tokens": 7723138.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 220.5, "completions/mean_terminated_length": 220.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.583701324769169, "frac_reward_zero_std": 1.0, "grad_norm": 0.037063101851803504, "kl": 0.0027008056640625, "learning_rate": 4.977813848954349e-07, "loss": 0.0001, "num_tokens": 7725878.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 203.125, "completions/mean_terminated_length": 203.125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.5839689549043222, "frac_reward_zero_std": 1.0, "grad_norm": 0.013375654113831119, "kl": 0.0009613037109375, "learning_rate": 4.97363873942569e-07, "loss": 0.0, "num_tokens": 7728435.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.5842365850394754, "frac_reward_zero_std": 1.0, "grad_norm": 0.0160057652878766, "kl": 0.00215911865234375, "learning_rate": 4.969464089232412e-07, "loss": 0.0001, "num_tokens": 7732378.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.5845042151746287, "frac_reward_zero_std": 1.0, "grad_norm": 0.0228398307849742, "kl": 0.002166748046875, "learning_rate": 4.96528990201757e-07, "loss": 0.0001, "num_tokens": 7735249.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5847718453097819, "frac_reward_zero_std": 1.0, "grad_norm": 0.01670416067151448, "kl": 0.001537322998046875, "learning_rate": 4.961116181423821e-07, "loss": 0.0001, "num_tokens": 7739441.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 224.125, "completions/mean_terminated_length": 224.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.5850394754449351, "frac_reward_zero_std": 1.0, "grad_norm": 0.010523747566826839, "kl": 0.000675201416015625, "learning_rate": 4.956942931093409e-07, "loss": 0.0, "num_tokens": 7742214.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5853071055800884, "frac_reward_zero_std": 0.5, "grad_norm": 0.8609983646124049, "kl": 0.001659393310546875, "learning_rate": 4.952770154668172e-07, "loss": 0.0139, "num_tokens": 7745817.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 264.25, "completions/mean_terminated_length": 264.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.5855747357152415, "frac_reward_zero_std": 1.0, "grad_norm": 0.014877164991784469, "kl": 0.0014495849609375, "learning_rate": 4.94859785578953e-07, "loss": 0.0001, "num_tokens": 7749019.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.5858423658503947, "frac_reward_zero_std": 1.0, "grad_norm": 0.02171153887438732, "kl": 0.0020599365234375, "learning_rate": 4.944426038098492e-07, "loss": 0.0001, "num_tokens": 7752286.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.586109995985548, "frac_reward_zero_std": 1.0, "grad_norm": 0.029032234352623337, "kl": 0.00237274169921875, "learning_rate": 4.940254705235642e-07, "loss": 0.0001, "num_tokens": 7755262.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.5863776261207012, "frac_reward_zero_std": 1.0, "grad_norm": 0.047025124820094855, "kl": 0.002349853515625, "learning_rate": 4.936083860841146e-07, "loss": 0.0001, "num_tokens": 7758211.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 378.5, "completions/mean_terminated_length": 378.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.5866452562558544, "frac_reward_zero_std": 1.0, "grad_norm": 0.01500952954060675, "kl": 0.001537322998046875, "learning_rate": 4.931913508554737e-07, "loss": 0.0001, "num_tokens": 7762267.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.5869128863910076, "frac_reward_zero_std": 1.0, "grad_norm": 0.012946029823643169, "kl": 0.00130462646484375, "learning_rate": 4.927743652015724e-07, "loss": 0.0001, "num_tokens": 7765455.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 235.25, "completions/mean_terminated_length": 235.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.5871805165261609, "frac_reward_zero_std": 1.0, "grad_norm": 0.015921505710789714, "kl": 0.001255035400390625, "learning_rate": 4.923574294862981e-07, "loss": 0.0001, "num_tokens": 7768353.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5874481466613141, "frac_reward_zero_std": 1.0, "grad_norm": 0.01162924319813791, "kl": 0.0006837844848632812, "learning_rate": 4.919405440734947e-07, "loss": 0.0, "num_tokens": 7771434.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 395.5, "completions/mean_terminated_length": 395.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.5877157767964672, "frac_reward_zero_std": 1.0, "grad_norm": 0.03785662912147166, "kl": 0.00272369384765625, "learning_rate": 4.915237093269623e-07, "loss": 0.0001, "num_tokens": 7775918.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5879834069316205, "frac_reward_zero_std": 1.0, "grad_norm": 0.022134251489808656, "kl": 0.002193450927734375, "learning_rate": 4.911069256104564e-07, "loss": 0.0001, "num_tokens": 7779021.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 239.0, "completions/mean_terminated_length": 239.0, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.5882510370667737, "frac_reward_zero_std": 1.0, "grad_norm": 0.02491660351279363, "kl": 0.001491546630859375, "learning_rate": 4.906901932876887e-07, "loss": 0.0001, "num_tokens": 7781973.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5885186672019269, "frac_reward_zero_std": 0.5, "grad_norm": 0.9546033972447258, "kl": 0.0020751953125, "learning_rate": 4.902735127223251e-07, "loss": 0.0617, "num_tokens": 7785735.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 580.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.5887862973370801, "frac_reward_zero_std": 1.0, "grad_norm": 0.016037359641517934, "kl": 0.001739501953125, "learning_rate": 4.898568842779869e-07, "loss": 0.0001, "num_tokens": 7790283.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 296.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.5890539274722334, "frac_reward_zero_std": 1.0, "grad_norm": 0.01888763212711682, "kl": 0.000766754150390625, "learning_rate": 4.894403083182504e-07, "loss": 0.0, "num_tokens": 7793719.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 334.875, "completions/mean_terminated_length": 334.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.5893215576073866, "frac_reward_zero_std": 1.0, "grad_norm": 0.01646534387851531, "kl": 0.0013675689697265625, "learning_rate": 4.890237852066448e-07, "loss": 0.0001, "num_tokens": 7797578.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 218.625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5895891877425398, "frac_reward_zero_std": 1.0, "grad_norm": 0.01162920313758298, "kl": 0.000919342041015625, "learning_rate": 4.886073153066544e-07, "loss": 0.0, "num_tokens": 7800207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.5898568178776931, "frac_reward_zero_std": 1.0, "grad_norm": 0.016031082565207404, "kl": 0.001251220703125, "learning_rate": 4.881908989817163e-07, "loss": 0.0, "num_tokens": 7803778.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.5901244480128462, "frac_reward_zero_std": 1.0, "grad_norm": 0.03084880298347723, "kl": 0.0024871826171875, "learning_rate": 4.877745365952214e-07, "loss": 0.0001, "num_tokens": 7806942.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.5903920781479994, "frac_reward_zero_std": 1.0, "grad_norm": 0.017704299442061137, "kl": 0.00138092041015625, "learning_rate": 4.873582285105129e-07, "loss": 0.0001, "num_tokens": 7810143.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 325.625, "completions/mean_terminated_length": 325.625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5906597082831527, "frac_reward_zero_std": 1.0, "grad_norm": 0.015018842845142545, "kl": 0.00107574462890625, "learning_rate": 4.869419750908873e-07, "loss": 0.0, "num_tokens": 7813884.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 251.625, "completions/mean_terminated_length": 251.625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.5909273384183059, "frac_reward_zero_std": 1.0, "grad_norm": 0.016823629851086624, "kl": 0.0016937255859375, "learning_rate": 4.865257766995928e-07, "loss": 0.0001, "num_tokens": 7817025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.5911949685534591, "frac_reward_zero_std": 1.0, "grad_norm": 0.01893565128700647, "kl": 0.00153350830078125, "learning_rate": 4.861096336998302e-07, "loss": 0.0001, "num_tokens": 7819971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5914625986886123, "frac_reward_zero_std": 1.0, "grad_norm": 0.019038582691551485, "kl": 0.0015716552734375, "learning_rate": 4.856935464547515e-07, "loss": 0.0001, "num_tokens": 7822935.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "epoch": 0.5917302288237656, "frac_reward_zero_std": 1.0, "grad_norm": 0.018641031558166755, "kl": 0.0008296966552734375, "learning_rate": 4.852775153274597e-07, "loss": 0.0, "num_tokens": 7825775.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.5919978589589188, "frac_reward_zero_std": 1.0, "grad_norm": 0.01508020899104005, "kl": 0.0011844635009765625, "learning_rate": 4.8486154068101e-07, "loss": 0.0, "num_tokens": 7829485.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.592265489094072, "frac_reward_zero_std": 1.0, "grad_norm": 0.01354158335759168, "kl": 0.0009365081787109375, "learning_rate": 4.844456228784069e-07, "loss": 0.0, "num_tokens": 7832547.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 242.375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.5925331192292252, "frac_reward_zero_std": 1.0, "grad_norm": 0.013197283131194327, "kl": 0.00080108642578125, "learning_rate": 4.840297622826066e-07, "loss": 0.0, "num_tokens": 7835618.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 197.625, "completions/mean_terminated_length": 197.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5928007493643784, "frac_reward_zero_std": 1.0, "grad_norm": 0.2002431638446675, "kl": 0.0056304931640625, "learning_rate": 4.836139592565144e-07, "loss": 0.0002, "num_tokens": 7838131.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 300.625, "completions/mean_terminated_length": 300.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5930683794995316, "frac_reward_zero_std": 0.5, "grad_norm": 0.9744706347792211, "kl": 0.003173828125, "learning_rate": 4.83198214162986e-07, "loss": -0.0115, "num_tokens": 7841688.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.5933360096346849, "frac_reward_zero_std": 1.0, "grad_norm": 0.022391068058666617, "kl": 0.00196075439453125, "learning_rate": 4.82782527364826e-07, "loss": 0.0001, "num_tokens": 7844912.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.5936036397698381, "frac_reward_zero_std": 1.0, "grad_norm": 0.015533449214130897, "kl": 0.0010089874267578125, "learning_rate": 4.823668992247885e-07, "loss": 0.0, "num_tokens": 7848026.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.5938712699049913, "frac_reward_zero_std": 1.0, "grad_norm": 0.007778534928141156, "kl": 0.0007076263427734375, "learning_rate": 4.81951330105576e-07, "loss": 0.0, "num_tokens": 7851732.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 380.875, "completions/mean_terminated_length": 380.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.5941389000401445, "frac_reward_zero_std": 1.0, "grad_norm": 0.013951586843001695, "kl": 0.001293182373046875, "learning_rate": 4.815358203698405e-07, "loss": 0.0001, "num_tokens": 7855903.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.5944065301752978, "frac_reward_zero_std": 1.0, "grad_norm": 0.017752506358335963, "kl": 0.001117706298828125, "learning_rate": 4.81120370380181e-07, "loss": 0.0, "num_tokens": 7858684.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.594674160310451, "frac_reward_zero_std": 1.0, "grad_norm": 0.021310409773958518, "kl": 0.00173187255859375, "learning_rate": 4.807049804991449e-07, "loss": 0.0001, "num_tokens": 7862020.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 347.75, "completions/mean_terminated_length": 347.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.5949417904456041, "frac_reward_zero_std": 1.0, "grad_norm": 0.025264701438754845, "kl": 0.00223541259765625, "learning_rate": 4.802896510892273e-07, "loss": 0.0001, "num_tokens": 7865874.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 319.0, "completions/mean_terminated_length": 319.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.5952094205807574, "frac_reward_zero_std": 1.0, "grad_norm": 0.020479649749625965, "kl": 0.001888275146484375, "learning_rate": 4.798743825128701e-07, "loss": 0.0001, "num_tokens": 7869514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 294.75, "completions/mean_terminated_length": 294.75, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.5954770507159106, "frac_reward_zero_std": 1.0, "grad_norm": 0.014441915217639467, "kl": 0.00125885009765625, "learning_rate": 4.794591751324625e-07, "loss": 0.0001, "num_tokens": 7872884.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.5957446808510638, "frac_reward_zero_std": 1.0, "grad_norm": 0.012692929468232876, "kl": 0.001094818115234375, "learning_rate": 4.790440293103398e-07, "loss": 0.0, "num_tokens": 7876524.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 243.125, "completions/mean_terminated_length": 243.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.5960123109862171, "frac_reward_zero_std": 1.0, "grad_norm": 0.042790918482355264, "kl": 0.002460479736328125, "learning_rate": 4.786289454087844e-07, "loss": 0.0001, "num_tokens": 7879501.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 204.25, "completions/mean_terminated_length": 204.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5962799411213703, "frac_reward_zero_std": 1.0, "grad_norm": 0.01903070774911535, "kl": 0.0015411376953125, "learning_rate": 4.782139237900237e-07, "loss": 0.0001, "num_tokens": 7882243.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 303.75, "completions/mean_terminated_length": 303.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.5965475712565235, "frac_reward_zero_std": 1.0, "grad_norm": 0.016830388776176066, "kl": 0.001316070556640625, "learning_rate": 4.777989648162316e-07, "loss": 0.0001, "num_tokens": 7885565.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.5968152013916767, "frac_reward_zero_std": 1.0, "grad_norm": 0.020054601844720004, "kl": 0.002010345458984375, "learning_rate": 4.773840688495268e-07, "loss": 0.0001, "num_tokens": 7888444.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.59708283152683, "frac_reward_zero_std": 1.0, "grad_norm": 0.020632095271400964, "kl": 0.002410888671875, "learning_rate": 4.769692362519731e-07, "loss": 0.0001, "num_tokens": 7892168.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.5973504616619831, "frac_reward_zero_std": 1.0, "grad_norm": 0.01378457219339036, "kl": 0.00136566162109375, "learning_rate": 4.7655446738557926e-07, "loss": 0.0001, "num_tokens": 7895829.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 313.875, "completions/mean_terminated_length": 313.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.5976180917971363, "frac_reward_zero_std": 0.5, "grad_norm": 0.9213983297719952, "kl": 0.000858306884765625, "learning_rate": 4.7613976261229805e-07, "loss": 0.036, "num_tokens": 7899520.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.5978857219322896, "frac_reward_zero_std": 0.5, "grad_norm": 0.6607132904184592, "kl": 0.001117706298828125, "learning_rate": 4.757251222940267e-07, "loss": -0.0526, "num_tokens": 7902961.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 266.375, "completions/mean_terminated_length": 266.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.5981533520674428, "frac_reward_zero_std": 1.0, "grad_norm": 0.01070859586740411, "kl": 0.000797271728515625, "learning_rate": 4.7531054679260574e-07, "loss": 0.0, "num_tokens": 7906188.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.598420982202596, "frac_reward_zero_std": 1.0, "grad_norm": 0.020045539754800046, "kl": 0.0016841888427734375, "learning_rate": 4.7489603646981955e-07, "loss": 0.0001, "num_tokens": 7909549.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 206.25, "completions/mean_terminated_length": 206.25, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.5986886123377493, "frac_reward_zero_std": 1.0, "grad_norm": 0.025662303847411964, "kl": 0.001506805419921875, "learning_rate": 4.744815916873952e-07, "loss": 0.0001, "num_tokens": 7912231.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 352.5, "completions/mean_terminated_length": 352.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.5989562424729025, "frac_reward_zero_std": 0.5, "grad_norm": 1.055736872458477, "kl": 0.001506805419921875, "learning_rate": 4.7406721280700334e-07, "loss": 0.0258, "num_tokens": 7916287.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 362.0, "completions/mean_terminated_length": 362.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.5992238726080557, "frac_reward_zero_std": 1.0, "grad_norm": 0.02171418111598737, "kl": 0.00160980224609375, "learning_rate": 4.736529001902563e-07, "loss": 0.0001, "num_tokens": 7920303.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.5994915027432088, "frac_reward_zero_std": 1.0, "grad_norm": 0.01135911822124106, "kl": 0.00122833251953125, "learning_rate": 4.732386541987087e-07, "loss": 0.0, "num_tokens": 7924209.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 404.625, "completions/mean_terminated_length": 404.625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.5997591328783621, "frac_reward_zero_std": 1.0, "grad_norm": 0.016593643786020847, "kl": 0.001720428466796875, "learning_rate": 4.728244751938576e-07, "loss": 0.0001, "num_tokens": 7928454.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 342.0, "completions/mean_terminated_length": 342.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.6000267630135153, "frac_reward_zero_std": 0.5, "grad_norm": 0.8717981453162231, "kl": 0.0021209716796875, "learning_rate": 4.7241036353714083e-07, "loss": 0.0309, "num_tokens": 7932330.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 356.875, "completions/mean_terminated_length": 356.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6002943931486685, "frac_reward_zero_std": 1.0, "grad_norm": 0.04082884227467662, "kl": 0.003387451171875, "learning_rate": 4.7199631958993815e-07, "loss": 0.0001, "num_tokens": 7936357.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 177.875, "completions/mean_terminated_length": 177.875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6005620232838218, "frac_reward_zero_std": 1.0, "grad_norm": 0.03321612924857274, "kl": 0.002655029296875, "learning_rate": 4.7158234371356974e-07, "loss": 0.0001, "num_tokens": 7938588.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 303.25, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.600829653418975, "frac_reward_zero_std": 1.0, "grad_norm": 0.015333029724843964, "kl": 0.001735687255859375, "learning_rate": 4.711684362692965e-07, "loss": 0.0001, "num_tokens": 7942258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 410.375, "completions/mean_terminated_length": 410.375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.6010972835541282, "frac_reward_zero_std": 1.0, "grad_norm": 0.03620634953569075, "kl": 0.002498626708984375, "learning_rate": 4.707545976183198e-07, "loss": 0.0001, "num_tokens": 7946953.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 337.25, "completions/mean_terminated_length": 337.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6013649136892815, "frac_reward_zero_std": 1.0, "grad_norm": 0.011662295060030718, "kl": 0.0010929107666015625, "learning_rate": 4.703408281217808e-07, "loss": 0.0, "num_tokens": 7950755.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.6016325438244347, "frac_reward_zero_std": 1.0, "grad_norm": 0.015110866661309412, "kl": 0.001415252685546875, "learning_rate": 4.699271281407601e-07, "loss": 0.0001, "num_tokens": 7954513.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.6019001739595878, "frac_reward_zero_std": 1.0, "grad_norm": 0.02777097000826701, "kl": 0.00186920166015625, "learning_rate": 4.6951349803627817e-07, "loss": 0.0001, "num_tokens": 7958812.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.602167804094741, "frac_reward_zero_std": 0.5, "grad_norm": 1.0916705074356847, "kl": 0.00341033935546875, "learning_rate": 4.690999381692943e-07, "loss": 0.0137, "num_tokens": 7962252.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 545.25, "completions/mean_terminated_length": 545.25, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "epoch": 0.6024354342298943, "frac_reward_zero_std": 1.0, "grad_norm": 0.019130081077059928, "kl": 0.0018768310546875, "learning_rate": 4.6868644890070596e-07, "loss": 0.0001, "num_tokens": 7967774.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 322.375, "completions/mean_terminated_length": 322.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.6027030643650475, "frac_reward_zero_std": 1.0, "grad_norm": 0.025315119472708165, "kl": 0.002498626708984375, "learning_rate": 4.6827303059135005e-07, "loss": 0.0001, "num_tokens": 7971537.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6029706945002007, "frac_reward_zero_std": 1.0, "grad_norm": 0.01904324886638613, "kl": 0.0013885498046875, "learning_rate": 4.678596836020002e-07, "loss": 0.0001, "num_tokens": 7974630.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.603238324635354, "frac_reward_zero_std": 1.0, "grad_norm": 0.012482711245037031, "kl": 0.0008716583251953125, "learning_rate": 4.6744640829336935e-07, "loss": 0.0, "num_tokens": 7978635.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 453.625, "completions/mean_terminated_length": 372.14288330078125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6035059547705072, "frac_reward_zero_std": 0.5, "grad_norm": 0.7262980593684275, "kl": 0.002086639404296875, "learning_rate": 4.6703320502610646e-07, "loss": 0.0977, "num_tokens": 7983464.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 360.625, "completions/mean_terminated_length": 360.625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.6037735849056604, "frac_reward_zero_std": 1.0, "grad_norm": 0.021196575064632454, "kl": 0.0028228759765625, "learning_rate": 4.6662007416079865e-07, "loss": 0.0001, "num_tokens": 7987549.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 231.625, "completions/mean_terminated_length": 231.625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6040412150408137, "frac_reward_zero_std": 1.0, "grad_norm": 0.014105642372383825, "kl": 0.001346588134765625, "learning_rate": 4.662070160579692e-07, "loss": 0.0001, "num_tokens": 7990366.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 412.75, "completions/mean_terminated_length": 412.75, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "epoch": 0.6043088451759668, "frac_reward_zero_std": 1.0, "grad_norm": 0.01810866038428006, "kl": 0.002349853515625, "learning_rate": 4.657940310780781e-07, "loss": 0.0001, "num_tokens": 7994904.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 355.0, "completions/mean_terminated_length": 355.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.60457647531112, "frac_reward_zero_std": 0.5, "grad_norm": 0.6912828531615589, "kl": 0.001251220703125, "learning_rate": 4.6538111958152193e-07, "loss": 0.2102, "num_tokens": 7998740.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6048441054462732, "frac_reward_zero_std": 1.0, "grad_norm": 0.024436694288016713, "kl": 0.00144195556640625, "learning_rate": 4.649682819286325e-07, "loss": 0.0001, "num_tokens": 8001802.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 244.0, "completions/mean_terminated_length": 244.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6051117355814265, "frac_reward_zero_std": 1.0, "grad_norm": 0.010089706838935194, "kl": 0.0006542205810546875, "learning_rate": 4.6455551847967753e-07, "loss": 0.0, "num_tokens": 8004754.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 301.625, "completions/mean_terminated_length": 301.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.6053793657165797, "frac_reward_zero_std": 1.0, "grad_norm": 0.019198884998013057, "kl": 0.0018463134765625, "learning_rate": 4.641428295948601e-07, "loss": 0.0001, "num_tokens": 8008375.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 384.5, "completions/mean_terminated_length": 384.5, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6056469958517329, "frac_reward_zero_std": 1.0, "grad_norm": 0.012855675518065447, "kl": 0.001041412353515625, "learning_rate": 4.637302156343178e-07, "loss": 0.0, "num_tokens": 8012627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 199.5, "completions/mean_terminated_length": 199.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6059146259868862, "frac_reward_zero_std": 1.0, "grad_norm": 0.01359308213208079, "kl": 0.0010223388671875, "learning_rate": 4.6331767695812305e-07, "loss": 0.0, "num_tokens": 8015131.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6061822561220394, "frac_reward_zero_std": 1.0, "grad_norm": 0.01873718476112333, "kl": 0.002246856689453125, "learning_rate": 4.629052139262829e-07, "loss": 0.0001, "num_tokens": 8019002.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.6064498862571925, "frac_reward_zero_std": 1.0, "grad_norm": 0.018267674283284723, "kl": 0.002227783203125, "learning_rate": 4.624928268987378e-07, "loss": 0.0001, "num_tokens": 8022511.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 332.125, "completions/mean_terminated_length": 332.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6067175163923457, "frac_reward_zero_std": 0.5, "grad_norm": 0.5907546369545997, "kl": 0.001888275146484375, "learning_rate": 4.620805162353625e-07, "loss": 0.0205, "num_tokens": 8026200.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 322.625, "completions/mean_terminated_length": 322.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.606985146527499, "frac_reward_zero_std": 1.0, "grad_norm": 0.019234468780792706, "kl": 0.001804351806640625, "learning_rate": 4.616682822959643e-07, "loss": 0.0001, "num_tokens": 8029801.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 375.125, "completions/mean_terminated_length": 375.125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6072527766626522, "frac_reward_zero_std": 1.0, "grad_norm": 0.018729189798596443, "kl": 0.00246429443359375, "learning_rate": 4.6125612544028435e-07, "loss": 0.0001, "num_tokens": 8033838.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.6075204067978054, "frac_reward_zero_std": 1.0, "grad_norm": 0.02096376632706723, "kl": 0.00177764892578125, "learning_rate": 4.608440460279963e-07, "loss": 0.0001, "num_tokens": 8037470.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 384.625, "completions/mean_terminated_length": 384.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6077880369329587, "frac_reward_zero_std": 1.0, "grad_norm": 0.011788577461207221, "kl": 0.0008373260498046875, "learning_rate": 4.6043204441870575e-07, "loss": 0.0, "num_tokens": 8041899.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6080556670681119, "frac_reward_zero_std": 1.0, "grad_norm": 0.08334551234331118, "kl": 0.0095367431640625, "learning_rate": 4.600201209719513e-07, "loss": 0.0004, "num_tokens": 8045226.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6083232972032651, "frac_reward_zero_std": 1.0, "grad_norm": 0.027298646658991326, "kl": 0.0024261474609375, "learning_rate": 4.596082760472022e-07, "loss": 0.0001, "num_tokens": 8048692.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 335.5, "completions/mean_terminated_length": 335.5, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.6085909273384184, "frac_reward_zero_std": 1.0, "grad_norm": 0.021673096123281742, "kl": 0.00160980224609375, "learning_rate": 4.591965100038604e-07, "loss": 0.0001, "num_tokens": 8052576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6088585574735715, "frac_reward_zero_std": 0.5, "grad_norm": 1.0673094182260048, "kl": 0.0019989013671875, "learning_rate": 4.587848232012579e-07, "loss": 0.0001, "num_tokens": 8055665.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 284.875, "completions/mean_terminated_length": 284.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6091261876087247, "frac_reward_zero_std": 1.0, "grad_norm": 0.025533200280900375, "kl": 0.0024261474609375, "learning_rate": 4.583732159986584e-07, "loss": 0.0001, "num_tokens": 8059180.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6093938177438779, "frac_reward_zero_std": 0.5, "grad_norm": 0.9390883291415977, "kl": 0.00211334228515625, "learning_rate": 4.5796168875525554e-07, "loss": 0.0172, "num_tokens": 8063137.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6096614478790312, "frac_reward_zero_std": 1.0, "grad_norm": 0.01445086082083542, "kl": 0.00171661376953125, "learning_rate": 4.5755024183017354e-07, "loss": 0.0001, "num_tokens": 8066657.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 323.375, "completions/mean_terminated_length": 323.375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6099290780141844, "frac_reward_zero_std": 1.0, "grad_norm": 0.04403105415463255, "kl": 0.0020904541015625, "learning_rate": 4.571388755824667e-07, "loss": 0.0001, "num_tokens": 8070308.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6101967081493376, "frac_reward_zero_std": 1.0, "grad_norm": 0.01895259593459498, "kl": 0.001850128173828125, "learning_rate": 4.567275903711182e-07, "loss": 0.0001, "num_tokens": 8073349.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.6104643382844909, "frac_reward_zero_std": 1.0, "grad_norm": 0.01083206752915104, "kl": 0.001003265380859375, "learning_rate": 4.5631638655504123e-07, "loss": 0.0, "num_tokens": 8076619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 277.125, "completions/mean_terminated_length": 277.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6107319684196441, "frac_reward_zero_std": 1.0, "grad_norm": 0.0174396638621502, "kl": 0.00147247314453125, "learning_rate": 4.5590526449307745e-07, "loss": 0.0001, "num_tokens": 8079836.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6109995985547972, "frac_reward_zero_std": 1.0, "grad_norm": 0.030809575245550926, "kl": 0.00191497802734375, "learning_rate": 4.554942245439977e-07, "loss": 0.0001, "num_tokens": 8082769.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6112672286899505, "frac_reward_zero_std": 1.0, "grad_norm": 0.01315874094494349, "kl": 0.001056671142578125, "learning_rate": 4.5508326706650036e-07, "loss": 0.0, "num_tokens": 8085951.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 342.125, "completions/mean_terminated_length": 342.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6115348588251037, "frac_reward_zero_std": 0.5, "grad_norm": 0.664908655419682, "kl": 0.00186920166015625, "learning_rate": 4.5467239241921294e-07, "loss": 0.0219, "num_tokens": 8089764.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 267.625, "completions/mean_terminated_length": 267.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6118024889602569, "frac_reward_zero_std": 1.0, "grad_norm": 0.014695421652797415, "kl": 0.00142669677734375, "learning_rate": 4.5426160096068955e-07, "loss": 0.0001, "num_tokens": 8093021.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.6120701190954101, "frac_reward_zero_std": 1.0, "grad_norm": 0.022814543025281808, "kl": 0.0019073486328125, "learning_rate": 4.5385089304941237e-07, "loss": 0.0001, "num_tokens": 8096197.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 348.5, "completions/mean_terminated_length": 348.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.6123377492305634, "frac_reward_zero_std": 1.0, "grad_norm": 0.009419423458531476, "kl": 0.00080108642578125, "learning_rate": 4.5344026904379085e-07, "loss": 0.0, "num_tokens": 8100113.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6126053793657166, "frac_reward_zero_std": 1.0, "grad_norm": 0.015028866243469488, "kl": 0.001575469970703125, "learning_rate": 4.530297293021603e-07, "loss": 0.0001, "num_tokens": 8103417.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6128730095008698, "frac_reward_zero_std": 1.0, "grad_norm": 0.018674210565430036, "kl": 0.001453399658203125, "learning_rate": 4.526192741827837e-07, "loss": 0.0001, "num_tokens": 8106994.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 274.75, "completions/mean_terminated_length": 274.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.613140639636023, "frac_reward_zero_std": 1.0, "grad_norm": 0.014884584779296146, "kl": 0.001186370849609375, "learning_rate": 4.5220890404384903e-07, "loss": 0.0, "num_tokens": 8110124.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 252.0, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6134082697711762, "frac_reward_zero_std": 1.0, "grad_norm": 0.02170911985342445, "kl": 0.00193023681640625, "learning_rate": 4.5179861924347097e-07, "loss": 0.0001, "num_tokens": 8113368.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6136758999063294, "frac_reward_zero_std": 1.0, "grad_norm": 0.014922550541474233, "kl": 0.001308441162109375, "learning_rate": 4.5138842013968913e-07, "loss": 0.0001, "num_tokens": 8116664.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6139435300414827, "frac_reward_zero_std": 1.0, "grad_norm": 0.018957481260563416, "kl": 0.0010833740234375, "learning_rate": 4.5097830709046903e-07, "loss": 0.0, "num_tokens": 8119738.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 422.5, "completions/mean_terminated_length": 422.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6142111601766359, "frac_reward_zero_std": 0.5, "grad_norm": 0.6721474575518686, "kl": 0.002655029296875, "learning_rate": 4.505682804537001e-07, "loss": 0.0797, "num_tokens": 8124242.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6144787903117891, "frac_reward_zero_std": 1.0, "grad_norm": 0.011387224723231388, "kl": 0.0008869171142578125, "learning_rate": 4.501583405871975e-07, "loss": 0.0, "num_tokens": 8128055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 339.125, "completions/mean_terminated_length": 339.125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6147464204469423, "frac_reward_zero_std": 0.0, "grad_norm": 0.9609397657103833, "kl": 0.00177764892578125, "learning_rate": 4.497484878486998e-07, "loss": -0.0152, "num_tokens": 8131796.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6150140505820956, "frac_reward_zero_std": 1.0, "grad_norm": 0.014675109216450972, "kl": 0.001331329345703125, "learning_rate": 4.493387225958698e-07, "loss": 0.0001, "num_tokens": 8134667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 331.5, "completions/mean_terminated_length": 331.5, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6152816807172488, "frac_reward_zero_std": 0.5, "grad_norm": 0.7692349458101823, "kl": 0.001598358154296875, "learning_rate": 4.4892904518629427e-07, "loss": 0.0285, "num_tokens": 8138371.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 303.125, "completions/mean_terminated_length": 303.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6155493108524019, "frac_reward_zero_std": 0.5, "grad_norm": 0.8344773971514498, "kl": 0.001667022705078125, "learning_rate": 4.485194559774826e-07, "loss": 0.0526, "num_tokens": 8141652.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 289.875, "completions/mean_terminated_length": 289.875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.6158169409875552, "frac_reward_zero_std": 0.5, "grad_norm": 0.6855423829924466, "kl": 0.0005092620849609375, "learning_rate": 4.481099553268682e-07, "loss": 0.067, "num_tokens": 8144823.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6160845711227084, "frac_reward_zero_std": 1.0, "grad_norm": 0.02531007244594831, "kl": 0.00213623046875, "learning_rate": 4.4770054359180623e-07, "loss": 0.0001, "num_tokens": 8147895.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6163522012578616, "frac_reward_zero_std": 1.0, "grad_norm": 0.022171634686916183, "kl": 0.00244903564453125, "learning_rate": 4.47291221129575e-07, "loss": 0.0001, "num_tokens": 8151448.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 376.125, "completions/mean_terminated_length": 376.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.6166198313930149, "frac_reward_zero_std": 0.5, "grad_norm": 1.0666024014337316, "kl": 0.00228118896484375, "learning_rate": 4.468819882973743e-07, "loss": 0.0717, "num_tokens": 8155669.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 315.0, "completions/mean_terminated_length": 315.0, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6168874615281681, "frac_reward_zero_std": 1.0, "grad_norm": 0.00984721840247487, "kl": 0.000732421875, "learning_rate": 4.4647284545232623e-07, "loss": 0.0, "num_tokens": 8159165.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6171550916633213, "frac_reward_zero_std": 0.5, "grad_norm": 0.6680133150190573, "kl": 0.002414703369140625, "learning_rate": 4.460637929514738e-07, "loss": -0.0084, "num_tokens": 8162227.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6174227217984745, "frac_reward_zero_std": 1.0, "grad_norm": 0.017498105478012276, "kl": 0.001377105712890625, "learning_rate": 4.456548311517818e-07, "loss": 0.0001, "num_tokens": 8165150.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 396.375, "completions/mean_terminated_length": 396.375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.6176903519336278, "frac_reward_zero_std": 1.0, "grad_norm": 0.011441921465899568, "kl": 0.00095367431640625, "learning_rate": 4.4524596041013574e-07, "loss": 0.0, "num_tokens": 8169489.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 261.625, "completions/mean_terminated_length": 261.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6179579820687809, "frac_reward_zero_std": 1.0, "grad_norm": 0.010917474925055905, "kl": 0.000896453857421875, "learning_rate": 4.448371810833412e-07, "loss": 0.0, "num_tokens": 8172566.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6182256122039341, "frac_reward_zero_std": 1.0, "grad_norm": 0.01946981824044803, "kl": 0.0015850067138671875, "learning_rate": 4.444284935281245e-07, "loss": 0.0001, "num_tokens": 8175855.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6184932423390874, "frac_reward_zero_std": 1.0, "grad_norm": 0.03525257629719334, "kl": 0.001922607421875, "learning_rate": 4.4401989810113126e-07, "loss": 0.0001, "num_tokens": 8179408.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6187608724742406, "frac_reward_zero_std": 1.0, "grad_norm": 0.019404565124382416, "kl": 0.001216888427734375, "learning_rate": 4.4361139515892775e-07, "loss": 0.0, "num_tokens": 8182486.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 297.125, "completions/mean_terminated_length": 297.125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6190285026093938, "frac_reward_zero_std": 1.0, "grad_norm": 0.03401859047537412, "kl": 0.001331329345703125, "learning_rate": 4.432029850579982e-07, "loss": 0.0001, "num_tokens": 8185963.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6192961327445471, "frac_reward_zero_std": 1.0, "grad_norm": 0.01890161282939713, "kl": 0.001476287841796875, "learning_rate": 4.427946681547473e-07, "loss": 0.0001, "num_tokens": 8188832.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 315.375, "completions/mean_terminated_length": 315.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.6195637628797003, "frac_reward_zero_std": 0.5, "grad_norm": 0.6395837066424254, "kl": 0.0014190673828125, "learning_rate": 4.4238644480549694e-07, "loss": 0.0126, "num_tokens": 8192491.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 349.125, "completions/mean_terminated_length": 349.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6198313930148535, "frac_reward_zero_std": 1.0, "grad_norm": 0.015822087903938614, "kl": 0.00138092041015625, "learning_rate": 4.4197831536648847e-07, "loss": 0.0001, "num_tokens": 8196468.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6200990231500066, "frac_reward_zero_std": 1.0, "grad_norm": 0.026183491162826263, "kl": 0.0020904541015625, "learning_rate": 4.4157028019388045e-07, "loss": 0.0001, "num_tokens": 8199667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 317.0, "completions/mean_terminated_length": 317.0, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.6203666532851599, "frac_reward_zero_std": 1.0, "grad_norm": 0.019445472286829576, "kl": 0.00206756591796875, "learning_rate": 4.4116233964374994e-07, "loss": 0.0001, "num_tokens": 8203231.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 839.0, "completions/max_terminated_length": 839.0, "completions/mean_length": 497.375, "completions/mean_terminated_length": 497.375, "completions/min_length": 419.0, "completions/min_terminated_length": 419.0, "epoch": 0.6206342834203131, "frac_reward_zero_std": 1.0, "grad_norm": 0.015723702384170826, "kl": 0.0017242431640625, "learning_rate": 4.407544940720912e-07, "loss": 0.0001, "num_tokens": 8208246.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 378.625, "completions/mean_terminated_length": 378.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6209019135554663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0128926532207923, "kl": 0.001377105712890625, "learning_rate": 4.4034674383481517e-07, "loss": 0.0001, "num_tokens": 8212551.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6211695436906196, "frac_reward_zero_std": 1.0, "grad_norm": 0.022138349951583897, "kl": 0.001720428466796875, "learning_rate": 4.3993908928775005e-07, "loss": 0.0001, "num_tokens": 8215920.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.6214371738257728, "frac_reward_zero_std": 1.0, "grad_norm": 0.015076836558611148, "kl": 0.00115203857421875, "learning_rate": 4.395315307866404e-07, "loss": 0.0, "num_tokens": 8220017.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 501.5, "completions/mean_terminated_length": 426.8571472167969, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.621704803960926, "frac_reward_zero_std": 0.5, "grad_norm": 0.4789310257141925, "kl": 0.00164031982421875, "learning_rate": 4.39124068687147e-07, "loss": 0.0717, "num_tokens": 8225365.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6219724340960792, "frac_reward_zero_std": 1.0, "grad_norm": 0.015991531038779032, "kl": 0.0015411376953125, "learning_rate": 4.3871670334484633e-07, "loss": 0.0001, "num_tokens": 8227936.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 274.375, "completions/mean_terminated_length": 274.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.6222400642312325, "frac_reward_zero_std": 1.0, "grad_norm": 0.014506809933050124, "kl": 0.00118255615234375, "learning_rate": 4.3830943511523087e-07, "loss": 0.0, "num_tokens": 8231151.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 291.5, "completions/mean_terminated_length": 291.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6225076943663856, "frac_reward_zero_std": 1.0, "grad_norm": 0.03279059262815035, "kl": 0.001953125, "learning_rate": 4.379022643537077e-07, "loss": 0.0001, "num_tokens": 8234555.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.6227753245015388, "frac_reward_zero_std": 1.0, "grad_norm": 0.04509995093062883, "kl": 0.0019378662109375, "learning_rate": 4.3749519141559943e-07, "loss": 0.0001, "num_tokens": 8238063.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6230429546366921, "frac_reward_zero_std": 1.0, "grad_norm": 0.026666829479541458, "kl": 0.001293182373046875, "learning_rate": 4.3708821665614316e-07, "loss": 0.0001, "num_tokens": 8240915.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.6233105847718453, "frac_reward_zero_std": 1.0, "grad_norm": 0.012553010213531146, "kl": 0.001438140869140625, "learning_rate": 4.366813404304899e-07, "loss": 0.0001, "num_tokens": 8244383.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 387.875, "completions/mean_terminated_length": 387.875, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.6235782149069985, "frac_reward_zero_std": 1.0, "grad_norm": 0.00878437766115824, "kl": 0.0007114410400390625, "learning_rate": 4.3627456309370525e-07, "loss": 0.0, "num_tokens": 8248514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6238458450421518, "frac_reward_zero_std": 1.0, "grad_norm": 0.016181310539087525, "kl": 0.0013275146484375, "learning_rate": 4.358678850007681e-07, "loss": 0.0001, "num_tokens": 8251627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.624113475177305, "frac_reward_zero_std": 0.5, "grad_norm": 0.7177928144218677, "kl": 0.0034332275390625, "learning_rate": 4.354613065065711e-07, "loss": 0.0803, "num_tokens": 8256011.0, "reward": 0.125, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 401.75, "completions/mean_terminated_length": 401.75, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6243811053124582, "frac_reward_zero_std": 1.0, "grad_norm": 0.009546109359489659, "kl": 0.0008544921875, "learning_rate": 4.3505482796591964e-07, "loss": 0.0, "num_tokens": 8260445.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 320.125, "completions/mean_terminated_length": 320.125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6246487354476113, "frac_reward_zero_std": 1.0, "grad_norm": 0.02995943682944174, "kl": 0.00319671630859375, "learning_rate": 4.346484497335321e-07, "loss": 0.0001, "num_tokens": 8264290.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6249163655827646, "frac_reward_zero_std": 1.0, "grad_norm": 0.009568059728929027, "kl": 0.0005817413330078125, "learning_rate": 4.3424217216403915e-07, "loss": 0.0, "num_tokens": 8267636.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 217.875, "completions/mean_terminated_length": 217.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6251839957179178, "frac_reward_zero_std": 1.0, "grad_norm": 0.04411935465321217, "kl": 0.001819610595703125, "learning_rate": 4.338359956119836e-07, "loss": 0.0001, "num_tokens": 8270255.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 375.625, "completions/mean_terminated_length": 375.625, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.625451625853071, "frac_reward_zero_std": 0.5, "grad_norm": 0.9790515875050315, "kl": 0.001861572265625, "learning_rate": 4.334299204318208e-07, "loss": -0.015, "num_tokens": 8274608.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 293.375, "completions/mean_terminated_length": 293.375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6257192559882243, "frac_reward_zero_std": 1.0, "grad_norm": 0.013076582588801827, "kl": 0.0012798309326171875, "learning_rate": 4.3302394697791644e-07, "loss": 0.0001, "num_tokens": 8277843.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 285.125, "completions/mean_terminated_length": 285.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6259868861233775, "frac_reward_zero_std": 1.0, "grad_norm": 0.02509588335409341, "kl": 0.00147247314453125, "learning_rate": 4.3261807560454835e-07, "loss": 0.0001, "num_tokens": 8281164.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 358.875, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6262545162585307, "frac_reward_zero_std": 1.0, "grad_norm": 0.013269909546734801, "kl": 0.00109100341796875, "learning_rate": 4.322123066659048e-07, "loss": 0.0, "num_tokens": 8285179.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.626522146393684, "frac_reward_zero_std": 1.0, "grad_norm": 0.02068724981556691, "kl": 0.00214385986328125, "learning_rate": 4.3180664051608504e-07, "loss": 0.0001, "num_tokens": 8288788.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 415.0, "completions/mean_terminated_length": 415.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6267897765288372, "frac_reward_zero_std": 0.5, "grad_norm": 0.48757166722782463, "kl": 0.0018157958984375, "learning_rate": 4.3140107750909825e-07, "loss": 0.0222, "num_tokens": 8293252.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 366.125, "completions/mean_terminated_length": 366.125, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6270574066639903, "frac_reward_zero_std": 1.0, "grad_norm": 0.013296749366112514, "kl": 0.0010833740234375, "learning_rate": 4.309956179988641e-07, "loss": 0.0, "num_tokens": 8297369.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 537.375, "completions/mean_terminated_length": 537.375, "completions/min_length": 426.0, "completions/min_terminated_length": 426.0, "epoch": 0.6273250367991435, "frac_reward_zero_std": 0.5, "grad_norm": 0.9124308074752031, "kl": 0.00151824951171875, "learning_rate": 4.305902623392113e-07, "loss": 0.0731, "num_tokens": 8303496.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 348.125, "completions/mean_terminated_length": 348.125, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.6275926669342968, "frac_reward_zero_std": 1.0, "grad_norm": 0.014650953937130224, "kl": 0.00084686279296875, "learning_rate": 4.3018501088387837e-07, "loss": 0.0, "num_tokens": 8307353.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 389.75, "completions/mean_terminated_length": 389.75, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.62786029706945, "frac_reward_zero_std": 1.0, "grad_norm": 0.01830519866823192, "kl": 0.00116729736328125, "learning_rate": 4.2977986398651276e-07, "loss": 0.0, "num_tokens": 8311731.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 232.0, "completions/mean_terminated_length": 232.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6281279272046032, "frac_reward_zero_std": 1.0, "grad_norm": 0.011868636645785583, "kl": 0.00084686279296875, "learning_rate": 4.2937482200067075e-07, "loss": 0.0, "num_tokens": 8314559.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 325.75, "completions/mean_terminated_length": 325.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.6283955573397565, "frac_reward_zero_std": 1.0, "grad_norm": 0.028289225546242944, "kl": 0.00208282470703125, "learning_rate": 4.2896988527981714e-07, "loss": 0.0001, "num_tokens": 8318377.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6286631874749097, "frac_reward_zero_std": 1.0, "grad_norm": 0.030009923285722204, "kl": 0.00152587890625, "learning_rate": 4.2856505417732437e-07, "loss": 0.0001, "num_tokens": 8321446.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 328.25, "completions/mean_terminated_length": 328.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.6289308176100629, "frac_reward_zero_std": 1.0, "grad_norm": 0.010774931314366045, "kl": 0.0009613037109375, "learning_rate": 4.281603290464736e-07, "loss": 0.0, "num_tokens": 8325296.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 420.0, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.6291984477452162, "frac_reward_zero_std": 1.0, "grad_norm": 0.02417219337105996, "kl": 0.0014629364013671875, "learning_rate": 4.2775571024045266e-07, "loss": 0.0001, "num_tokens": 8329760.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 254.125, "completions/mean_terminated_length": 254.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.6294660778803693, "frac_reward_zero_std": 1.0, "grad_norm": 0.02414304132162754, "kl": 0.001605987548828125, "learning_rate": 4.27351198112357e-07, "loss": 0.0001, "num_tokens": 8332761.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6297337080155225, "frac_reward_zero_std": 1.0, "grad_norm": 0.019403363876612935, "kl": 0.00217437744140625, "learning_rate": 4.269467930151888e-07, "loss": 0.0001, "num_tokens": 8336188.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6300013381506757, "frac_reward_zero_std": 1.0, "grad_norm": 0.012550223144875163, "kl": 0.001338958740234375, "learning_rate": 4.2654249530185724e-07, "loss": 0.0001, "num_tokens": 8339551.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 430.625, "completions/mean_terminated_length": 430.625, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 0.630268968285829, "frac_reward_zero_std": 0.5, "grad_norm": 0.576360837555537, "kl": 0.00157928466796875, "learning_rate": 4.2613830532517724e-07, "loss": 0.0014, "num_tokens": 8344332.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6305365984209822, "frac_reward_zero_std": 1.0, "grad_norm": 0.021637033838319688, "kl": 0.0015869140625, "learning_rate": 4.2573422343787004e-07, "loss": 0.0001, "num_tokens": 8348055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.6308042285561354, "frac_reward_zero_std": 1.0, "grad_norm": 0.011721342112959013, "kl": 0.0008983612060546875, "learning_rate": 4.253302499925626e-07, "loss": 0.0, "num_tokens": 8351183.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6310718586912887, "frac_reward_zero_std": 1.0, "grad_norm": 0.042585538700924706, "kl": 0.00279998779296875, "learning_rate": 4.2492638534178695e-07, "loss": 0.0001, "num_tokens": 8354233.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6313394888264419, "frac_reward_zero_std": 1.0, "grad_norm": 0.014481779578960166, "kl": 0.001331329345703125, "learning_rate": 4.245226298379806e-07, "loss": 0.0001, "num_tokens": 8357556.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 410.625, "completions/mean_terminated_length": 410.625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.631607118961595, "frac_reward_zero_std": 1.0, "grad_norm": 0.02164149289584404, "kl": 0.00159454345703125, "learning_rate": 4.241189838334851e-07, "loss": 0.0001, "num_tokens": 8362065.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.6318747490967483, "frac_reward_zero_std": 1.0, "grad_norm": 0.0054369392700651855, "kl": 0.00022649765014648438, "learning_rate": 4.2371544768054755e-07, "loss": 0.0, "num_tokens": 8365492.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6321423792319015, "frac_reward_zero_std": 1.0, "grad_norm": 0.011096643743424886, "kl": 0.001251220703125, "learning_rate": 4.2331202173131795e-07, "loss": 0.0001, "num_tokens": 8368634.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 631.0, "completions/mean_length": 484.625, "completions/mean_terminated_length": 407.5714416503906, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.6324100093670547, "frac_reward_zero_std": 0.5, "grad_norm": 0.5550437686885277, "kl": 0.00140380859375, "learning_rate": 4.2290870633785125e-07, "loss": 0.0914, "num_tokens": 8373723.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6326776395022079, "frac_reward_zero_std": 1.0, "grad_norm": 0.12762189161987103, "kl": 0.002918243408203125, "learning_rate": 4.2250550185210474e-07, "loss": 0.0001, "num_tokens": 8377397.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 401.125, "completions/mean_terminated_length": 401.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.6329452696373612, "frac_reward_zero_std": 1.0, "grad_norm": 0.013536625614524006, "kl": 0.001171112060546875, "learning_rate": 4.2210240862594036e-07, "loss": 0.0, "num_tokens": 8381954.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.6332128997725144, "frac_reward_zero_std": 0.5, "grad_norm": 1.316647559987911, "kl": 0.00189208984375, "learning_rate": 4.216994270111215e-07, "loss": -0.0134, "num_tokens": 8385562.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 390.75, "completions/mean_terminated_length": 390.75, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.6334805299076676, "frac_reward_zero_std": 1.0, "grad_norm": 0.034475982495844285, "kl": 0.0032196044921875, "learning_rate": 4.2129655735931513e-07, "loss": 0.0001, "num_tokens": 8389880.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 414.75, "completions/mean_terminated_length": 414.75, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6337481600428209, "frac_reward_zero_std": 1.0, "grad_norm": 0.011975116469827497, "kl": 0.00115203857421875, "learning_rate": 4.208938000220904e-07, "loss": 0.0, "num_tokens": 8394242.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 202.375, "completions/mean_terminated_length": 202.375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.634015790177974, "frac_reward_zero_std": 1.0, "grad_norm": 0.019692741840345945, "kl": 0.00246429443359375, "learning_rate": 4.2049115535091795e-07, "loss": 0.0001, "num_tokens": 8396777.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.6342834203131272, "frac_reward_zero_std": 1.0, "grad_norm": 0.025587988219896607, "kl": 0.002384185791015625, "learning_rate": 4.2008862369717067e-07, "loss": 0.0001, "num_tokens": 8400618.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6345510504482805, "frac_reward_zero_std": 1.0, "grad_norm": 0.02429008601263747, "kl": 0.001827239990234375, "learning_rate": 4.196862054121222e-07, "loss": 0.0001, "num_tokens": 8404059.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.6348186805834337, "frac_reward_zero_std": 1.0, "grad_norm": 0.022053390782004217, "kl": 0.00165557861328125, "learning_rate": 4.192839008469481e-07, "loss": 0.0001, "num_tokens": 8406930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 268.125, "completions/mean_terminated_length": 268.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6350863107185869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0458199376434333, "kl": 0.00208282470703125, "learning_rate": 4.188817103527238e-07, "loss": 0.0001, "num_tokens": 8410015.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6353539408537401, "frac_reward_zero_std": 1.0, "grad_norm": 0.019710548516578188, "kl": 0.0020599365234375, "learning_rate": 4.184796342804259e-07, "loss": 0.0001, "num_tokens": 8413430.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 176.125, "completions/mean_terminated_length": 176.125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6356215709888934, "frac_reward_zero_std": 1.0, "grad_norm": 0.018092580202287437, "kl": 0.0007724761962890625, "learning_rate": 4.1807767298093056e-07, "loss": 0.0, "num_tokens": 8415743.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.6358892011240466, "frac_reward_zero_std": 1.0, "grad_norm": 0.053274626595165554, "kl": 0.00312042236328125, "learning_rate": 4.176758268050141e-07, "loss": 0.0001, "num_tokens": 8418937.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6361568312591998, "frac_reward_zero_std": 1.0, "grad_norm": 0.019767656021821522, "kl": 0.00173187255859375, "learning_rate": 4.1727409610335264e-07, "loss": 0.0001, "num_tokens": 8423078.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 327.875, "completions/mean_terminated_length": 327.875, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.636424461394353, "frac_reward_zero_std": 1.0, "grad_norm": 0.025502463859166533, "kl": 0.0023193359375, "learning_rate": 4.1687248122652086e-07, "loss": 0.0001, "num_tokens": 8426657.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 353.125, "completions/mean_terminated_length": 353.125, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6366920915295062, "frac_reward_zero_std": 1.0, "grad_norm": 0.027291807661736684, "kl": 0.002044677734375, "learning_rate": 4.164709825249931e-07, "loss": 0.0001, "num_tokens": 8430494.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6369597216646594, "frac_reward_zero_std": 0.5, "grad_norm": 0.991544672970967, "kl": 0.0012493133544921875, "learning_rate": 4.160696003491416e-07, "loss": -0.0042, "num_tokens": 8433939.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 405.375, "completions/mean_terminated_length": 405.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6372273517998127, "frac_reward_zero_std": 1.0, "grad_norm": 0.01992378221186392, "kl": 0.00164794921875, "learning_rate": 4.156683350492376e-07, "loss": 0.0001, "num_tokens": 8438262.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6374949819349659, "frac_reward_zero_std": 1.0, "grad_norm": 0.0176478198139949, "kl": 0.00179290771484375, "learning_rate": 4.152671869754496e-07, "loss": 0.0001, "num_tokens": 8441841.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6377626120701191, "frac_reward_zero_std": 1.0, "grad_norm": 0.07855202386744695, "kl": 0.0040130615234375, "learning_rate": 4.148661564778445e-07, "loss": 0.0002, "num_tokens": 8444803.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6380302422052723, "frac_reward_zero_std": 1.0, "grad_norm": 0.025225116638745007, "kl": 0.001407623291015625, "learning_rate": 4.1446524390638617e-07, "loss": 0.0001, "num_tokens": 8447996.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.6382978723404256, "frac_reward_zero_std": 1.0, "grad_norm": 0.01579838216374743, "kl": 0.0016841888427734375, "learning_rate": 4.140644496109358e-07, "loss": 0.0001, "num_tokens": 8451462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6385655024755788, "frac_reward_zero_std": 1.0, "grad_norm": 0.03227346736516413, "kl": 0.001613616943359375, "learning_rate": 4.1366377394125127e-07, "loss": 0.0001, "num_tokens": 8454836.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6388331326107319, "frac_reward_zero_std": 1.0, "grad_norm": 0.019177848889144988, "kl": 0.00148773193359375, "learning_rate": 4.1326321724698687e-07, "loss": 0.0001, "num_tokens": 8458228.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 402.125, "completions/mean_terminated_length": 402.125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.6391007627458852, "frac_reward_zero_std": 1.0, "grad_norm": 0.021685755810832964, "kl": 0.001705169677734375, "learning_rate": 4.1286277987769334e-07, "loss": 0.0001, "num_tokens": 8462801.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6393683928810384, "frac_reward_zero_std": 1.0, "grad_norm": 0.02449731664916393, "kl": 0.00211334228515625, "learning_rate": 4.1246246218281667e-07, "loss": 0.0001, "num_tokens": 8466216.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 399.0, "completions/mean_terminated_length": 399.0, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.6396360230161916, "frac_reward_zero_std": 0.5, "grad_norm": 0.6947473771061886, "kl": 0.00185394287109375, "learning_rate": 4.1206226451169933e-07, "loss": -0.0093, "num_tokens": 8470444.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 290.25, "completions/mean_terminated_length": 290.25, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6399036531513448, "frac_reward_zero_std": 1.0, "grad_norm": 0.012131768067242647, "kl": 0.001361846923828125, "learning_rate": 4.1166218721357826e-07, "loss": 0.0001, "num_tokens": 8473846.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 433.875, "completions/mean_terminated_length": 433.875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6401712832864981, "frac_reward_zero_std": 1.0, "grad_norm": 0.018276206508361134, "kl": 0.00148773193359375, "learning_rate": 4.112622306375857e-07, "loss": 0.0001, "num_tokens": 8478465.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6404389134216513, "frac_reward_zero_std": 1.0, "grad_norm": 0.029427547925715145, "kl": 0.0023345947265625, "learning_rate": 4.1086239513274845e-07, "loss": 0.0001, "num_tokens": 8481550.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6407065435568045, "frac_reward_zero_std": 1.0, "grad_norm": 0.009953542996912413, "kl": 0.00119781494140625, "learning_rate": 4.1046268104798777e-07, "loss": 0.0, "num_tokens": 8485564.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 284.25, "completions/mean_terminated_length": 284.25, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6409741736919577, "frac_reward_zero_std": 1.0, "grad_norm": 0.010881976247439964, "kl": 0.0006051063537597656, "learning_rate": 4.100630887321187e-07, "loss": 0.0, "num_tokens": 8488786.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6412418038271109, "frac_reward_zero_std": 1.0, "grad_norm": 0.026883926659104065, "kl": 0.00290679931640625, "learning_rate": 4.0966361853385043e-07, "loss": 0.0001, "num_tokens": 8491879.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 289.0, "completions/mean_terminated_length": 289.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6415094339622641, "frac_reward_zero_std": 1.0, "grad_norm": 0.020010866357584246, "kl": 0.00136566162109375, "learning_rate": 4.0926427080178527e-07, "loss": 0.0001, "num_tokens": 8495135.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 377.125, "completions/mean_terminated_length": 377.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6417770640974174, "frac_reward_zero_std": 1.0, "grad_norm": 0.01824883772850432, "kl": 0.0013275146484375, "learning_rate": 4.088650458844187e-07, "loss": 0.0001, "num_tokens": 8499300.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 249.125, "completions/mean_terminated_length": 249.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6420446942325706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01466361030347474, "kl": 0.00060272216796875, "learning_rate": 4.0846594413013937e-07, "loss": 0.0, "num_tokens": 8502253.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 423.875, "completions/mean_terminated_length": 423.875, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.6423123243677238, "frac_reward_zero_std": 0.5, "grad_norm": 0.5441977573502211, "kl": 0.002162933349609375, "learning_rate": 4.0806696588722744e-07, "loss": 0.0529, "num_tokens": 8506932.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 248.875, "completions/mean_terminated_length": 248.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.642579954502877, "frac_reward_zero_std": 1.0, "grad_norm": 0.012614385963626703, "kl": 0.0010223388671875, "learning_rate": 4.0766811150385673e-07, "loss": 0.0, "num_tokens": 8509863.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.6428475846380303, "frac_reward_zero_std": 1.0, "grad_norm": 0.034528408505113926, "kl": 0.002620697021484375, "learning_rate": 4.0726938132809175e-07, "loss": 0.0001, "num_tokens": 8513685.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.6431152147731835, "frac_reward_zero_std": 1.0, "grad_norm": 0.010490871860091535, "kl": 0.00086212158203125, "learning_rate": 4.068707757078895e-07, "loss": 0.0, "num_tokens": 8517817.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.6433828449083366, "frac_reward_zero_std": 1.0, "grad_norm": 0.02082500237148724, "kl": 0.002086639404296875, "learning_rate": 4.064722949910976e-07, "loss": 0.0001, "num_tokens": 8521328.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6436504750434899, "frac_reward_zero_std": 1.0, "grad_norm": 0.00964152630306386, "kl": 0.0007343292236328125, "learning_rate": 4.0607393952545485e-07, "loss": 0.0, "num_tokens": 8524705.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6439181051786431, "frac_reward_zero_std": 1.0, "grad_norm": 0.012183839030588374, "kl": 0.001163482666015625, "learning_rate": 4.056757096585913e-07, "loss": 0.0, "num_tokens": 8528432.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6441857353137963, "frac_reward_zero_std": 1.0, "grad_norm": 0.0148226818715508, "kl": 0.001094818115234375, "learning_rate": 4.052776057380266e-07, "loss": 0.0, "num_tokens": 8531696.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 559.0, "completions/max_terminated_length": 559.0, "completions/mean_length": 378.125, "completions/mean_terminated_length": 378.125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6444533654489496, "frac_reward_zero_std": 1.0, "grad_norm": 0.009032430051581794, "kl": 0.0006237030029296875, "learning_rate": 4.0487962811117105e-07, "loss": 0.0, "num_tokens": 8535913.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6447209955841028, "frac_reward_zero_std": 1.0, "grad_norm": 0.039562912488062804, "kl": 0.0027008056640625, "learning_rate": 4.0448177712532426e-07, "loss": 0.0001, "num_tokens": 8539777.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 310.875, "completions/mean_terminated_length": 310.875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.644988625719256, "frac_reward_zero_std": 1.0, "grad_norm": 0.024421091121600486, "kl": 0.002178192138671875, "learning_rate": 4.04084053127676e-07, "loss": 0.0001, "num_tokens": 8543400.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 248.125, "completions/mean_terminated_length": 248.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6452562558544092, "frac_reward_zero_std": 1.0, "grad_norm": 0.03413421854682941, "kl": 0.00284576416015625, "learning_rate": 4.036864564653043e-07, "loss": 0.0001, "num_tokens": 8546625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 201.5, "completions/mean_terminated_length": 201.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6455238859895625, "frac_reward_zero_std": 1.0, "grad_norm": 0.010814878761688582, "kl": 0.0009918212890625, "learning_rate": 4.032889874851771e-07, "loss": 0.0, "num_tokens": 8549313.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.6457915161247156, "frac_reward_zero_std": 1.0, "grad_norm": 0.13316009492430703, "kl": 0.0045013427734375, "learning_rate": 4.028916465341501e-07, "loss": 0.0002, "num_tokens": 8552482.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 314.375, "completions/mean_terminated_length": 314.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.6460591462598688, "frac_reward_zero_std": 1.0, "grad_norm": 0.020782851074320936, "kl": 0.00145721435546875, "learning_rate": 4.0249443395896765e-07, "loss": 0.0001, "num_tokens": 8555965.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 532.0, "completions/max_terminated_length": 532.0, "completions/mean_length": 403.75, "completions/mean_terminated_length": 403.75, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.6463267763950221, "frac_reward_zero_std": 1.0, "grad_norm": 0.021665538277090356, "kl": 0.0024261474609375, "learning_rate": 4.020973501062621e-07, "loss": 0.0001, "num_tokens": 8560483.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6465944065301753, "frac_reward_zero_std": 1.0, "grad_norm": 0.020523109079094125, "kl": 0.00177001953125, "learning_rate": 4.0170039532255307e-07, "loss": 0.0001, "num_tokens": 8564278.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6468620366653285, "frac_reward_zero_std": 1.0, "grad_norm": 0.02201980093290348, "kl": 0.001697540283203125, "learning_rate": 4.013035699542484e-07, "loss": 0.0001, "num_tokens": 8567321.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 810.0, "completions/max_terminated_length": 810.0, "completions/mean_length": 349.25, "completions/mean_terminated_length": 349.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6471296668004818, "frac_reward_zero_std": 0.5, "grad_norm": 0.7358757338867243, "kl": 0.002895355224609375, "learning_rate": 4.009068743476418e-07, "loss": 0.0321, "num_tokens": 8571627.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 461.375, "completions/mean_terminated_length": 461.375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.647397296935635, "frac_reward_zero_std": 0.5, "grad_norm": 0.5980160709240429, "kl": 0.0020599365234375, "learning_rate": 4.005103088489149e-07, "loss": 0.0828, "num_tokens": 8576690.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 279.5, "completions/mean_terminated_length": 279.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6476649270707882, "frac_reward_zero_std": 1.0, "grad_norm": 0.018192522514005324, "kl": 0.00144195556640625, "learning_rate": 4.00113873804135e-07, "loss": 0.0001, "num_tokens": 8579886.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6479325572059413, "frac_reward_zero_std": 1.0, "grad_norm": 0.034718947766658494, "kl": 0.00241851806640625, "learning_rate": 3.9971756955925584e-07, "loss": 0.0001, "num_tokens": 8583138.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 261.0, "completions/mean_terminated_length": 261.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6482001873410946, "frac_reward_zero_std": 1.0, "grad_norm": 0.05631465642236148, "kl": 0.0049896240234375, "learning_rate": 3.9932139646011697e-07, "loss": 0.0002, "num_tokens": 8586390.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6484678174762478, "frac_reward_zero_std": 1.0, "grad_norm": 0.017090712126663143, "kl": 0.0014495849609375, "learning_rate": 3.989253548524439e-07, "loss": 0.0001, "num_tokens": 8589583.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 281.875, "completions/mean_terminated_length": 281.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.648735447611401, "frac_reward_zero_std": 1.0, "grad_norm": 0.02394121210917771, "kl": 0.0015716552734375, "learning_rate": 3.9852944508184647e-07, "loss": 0.0001, "num_tokens": 8593022.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.6490030777465543, "frac_reward_zero_std": 1.0, "grad_norm": 0.028224586190354332, "kl": 0.001598358154296875, "learning_rate": 3.9813366749382025e-07, "loss": 0.0001, "num_tokens": 8596707.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 448.625, "completions/mean_terminated_length": 448.625, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.6492707078817075, "frac_reward_zero_std": 0.5, "grad_norm": 0.8704104037999982, "kl": 0.001041412353515625, "learning_rate": 3.9773802243374555e-07, "loss": 0.0607, "num_tokens": 8601524.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 320.75, "completions/mean_terminated_length": 320.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6495383380168607, "frac_reward_zero_std": 1.0, "grad_norm": 0.025348247045277692, "kl": 0.00183868408203125, "learning_rate": 3.9734251024688637e-07, "loss": 0.0001, "num_tokens": 8605202.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.649805968152014, "frac_reward_zero_std": 1.0, "grad_norm": 0.018789194378480145, "kl": 0.00168609619140625, "learning_rate": 3.969471312783912e-07, "loss": 0.0001, "num_tokens": 8608619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6500735982871672, "frac_reward_zero_std": 0.5, "grad_norm": 0.7801371470618444, "kl": 0.001720428466796875, "learning_rate": 3.96551885873292e-07, "loss": -0.0409, "num_tokens": 8611661.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 223.375, "completions/mean_terminated_length": 223.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6503412284223203, "frac_reward_zero_std": 1.0, "grad_norm": 0.015784305435372417, "kl": 0.001033782958984375, "learning_rate": 3.9615677437650463e-07, "loss": 0.0, "num_tokens": 8614336.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6506088585574735, "frac_reward_zero_std": 1.0, "grad_norm": 0.014918044371669733, "kl": 0.001224517822265625, "learning_rate": 3.957617971328275e-07, "loss": 0.0, "num_tokens": 8617954.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 207.75, "completions/mean_terminated_length": 207.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6508764886926268, "frac_reward_zero_std": 1.0, "grad_norm": 0.01158588070758511, "kl": 0.0007572174072265625, "learning_rate": 3.9536695448694255e-07, "loss": 0.0, "num_tokens": 8620480.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 283.125, "completions/mean_terminated_length": 283.125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.65114411882778, "frac_reward_zero_std": 0.5, "grad_norm": 0.7411007979292967, "kl": 0.001922607421875, "learning_rate": 3.9497224678341356e-07, "loss": -0.0215, "num_tokens": 8623805.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 233.0, "completions/max_terminated_length": 233.0, "completions/mean_length": 207.25, "completions/mean_terminated_length": 207.25, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6514117489629332, "frac_reward_zero_std": 1.0, "grad_norm": 0.018783099227299663, "kl": 0.001789093017578125, "learning_rate": 3.9457767436668687e-07, "loss": 0.0001, "num_tokens": 8626431.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 285.25, "completions/mean_terminated_length": 285.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.6516793790980865, "frac_reward_zero_std": 1.0, "grad_norm": 0.021700799895393835, "kl": 0.001590728759765625, "learning_rate": 3.9418323758109117e-07, "loss": 0.0001, "num_tokens": 8629793.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 397.125, "completions/mean_terminated_length": 397.125, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.6519470092332397, "frac_reward_zero_std": 1.0, "grad_norm": 0.017439062624118158, "kl": 0.001682281494140625, "learning_rate": 3.937889367708358e-07, "loss": 0.0001, "num_tokens": 8634090.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 362.25, "completions/mean_terminated_length": 362.25, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.6522146393683929, "frac_reward_zero_std": 0.5, "grad_norm": 0.6344958679644455, "kl": 0.001476287841796875, "learning_rate": 3.9339477228001263e-07, "loss": 0.0001, "num_tokens": 8637868.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 298.625, "completions/mean_terminated_length": 298.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.6524822695035462, "frac_reward_zero_std": 0.5, "grad_norm": 0.6695197081557156, "kl": 0.00106048583984375, "learning_rate": 3.9300074445259346e-07, "loss": 0.0221, "num_tokens": 8641209.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 240.0, "completions/mean_terminated_length": 240.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.6527498996386993, "frac_reward_zero_std": 1.0, "grad_norm": 0.01756186999637442, "kl": 0.001041412353515625, "learning_rate": 3.9260685363243174e-07, "loss": 0.0, "num_tokens": 8644101.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6530175297738525, "frac_reward_zero_std": 1.0, "grad_norm": 0.025436172168914195, "kl": 0.0031585693359375, "learning_rate": 3.922131001632606e-07, "loss": 0.0001, "num_tokens": 8647324.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6532851599090057, "frac_reward_zero_std": 1.0, "grad_norm": 0.031382241612836995, "kl": 0.00197601318359375, "learning_rate": 3.9181948438869384e-07, "loss": 0.0001, "num_tokens": 8650753.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 284.875, "completions/mean_terminated_length": 284.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.653552790044159, "frac_reward_zero_std": 1.0, "grad_norm": 0.026388543996103752, "kl": 0.001678466796875, "learning_rate": 3.914260066522248e-07, "loss": 0.0001, "num_tokens": 8654080.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6538204201793122, "frac_reward_zero_std": 1.0, "grad_norm": 0.021282739896623208, "kl": 0.00243377685546875, "learning_rate": 3.9103266729722684e-07, "loss": 0.0001, "num_tokens": 8657776.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 255.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6540880503144654, "frac_reward_zero_std": 1.0, "grad_norm": 0.02270110084620557, "kl": 0.001216888427734375, "learning_rate": 3.906394666669517e-07, "loss": 0.0, "num_tokens": 8660967.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 210.375, "completions/mean_terminated_length": 210.375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6543556804496187, "frac_reward_zero_std": 0.5, "grad_norm": 1.2001320582971338, "kl": 0.00135040283203125, "learning_rate": 3.902464051045308e-07, "loss": 0.0439, "num_tokens": 8663646.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.6546233105847719, "frac_reward_zero_std": 0.5, "grad_norm": 0.7577705101129747, "kl": 0.00201416015625, "learning_rate": 3.8985348295297416e-07, "loss": 0.0213, "num_tokens": 8667303.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 356.75, "completions/mean_terminated_length": 356.75, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.654890940719925, "frac_reward_zero_std": 1.0, "grad_norm": 0.027563761914250234, "kl": 0.001499176025390625, "learning_rate": 3.8946070055516953e-07, "loss": 0.0001, "num_tokens": 8671365.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 322.75, "completions/mean_terminated_length": 322.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6551585708550783, "frac_reward_zero_std": 0.5, "grad_norm": 1.145262165030585, "kl": 0.001728057861328125, "learning_rate": 3.8906805825388343e-07, "loss": 0.0001, "num_tokens": 8674979.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 695.75, "completions/mean_terminated_length": 695.75, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "epoch": 0.6554262009902315, "frac_reward_zero_std": 0.5, "grad_norm": 0.5998284439125, "kl": 0.0019989013671875, "learning_rate": 3.8867555639175966e-07, "loss": 0.0001, "num_tokens": 8682109.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6556938311253847, "frac_reward_zero_std": 1.0, "grad_norm": 0.009911962127506833, "kl": 0.0007915496826171875, "learning_rate": 3.882831953113197e-07, "loss": 0.0, "num_tokens": 8684921.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.6559614612605379, "frac_reward_zero_std": 1.0, "grad_norm": 0.020257335153640303, "kl": 0.001373291015625, "learning_rate": 3.878909753549621e-07, "loss": 0.0001, "num_tokens": 8688401.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 356.375, "completions/mean_terminated_length": 356.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6562290913956912, "frac_reward_zero_std": 1.0, "grad_norm": 0.018743625706901915, "kl": 0.0017852783203125, "learning_rate": 3.8749889686496216e-07, "loss": 0.0001, "num_tokens": 8692452.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.6564967215308444, "frac_reward_zero_std": 1.0, "grad_norm": 0.019108884264177766, "kl": 0.001628875732421875, "learning_rate": 3.871069601834718e-07, "loss": 0.0001, "num_tokens": 8696066.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 246.375, "completions/mean_terminated_length": 246.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6567643516659976, "frac_reward_zero_std": 1.0, "grad_norm": 0.006249476030518312, "kl": 0.0003032684326171875, "learning_rate": 3.86715165652519e-07, "loss": 0.0, "num_tokens": 8698981.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.6570319818011509, "frac_reward_zero_std": 0.5, "grad_norm": 1.0478388339868465, "kl": 0.00341033935546875, "learning_rate": 3.863235136140086e-07, "loss": -0.0264, "num_tokens": 8702830.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.657299611936304, "frac_reward_zero_std": 1.0, "grad_norm": 0.021703133474035432, "kl": 0.002227783203125, "learning_rate": 3.859320044097197e-07, "loss": 0.0001, "num_tokens": 8706911.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6575672420714572, "frac_reward_zero_std": 1.0, "grad_norm": 0.0187568446075642, "kl": 0.0011444091796875, "learning_rate": 3.855406383813077e-07, "loss": 0.0, "num_tokens": 8710246.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 268.875, "completions/mean_terminated_length": 268.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6578348722066104, "frac_reward_zero_std": 1.0, "grad_norm": 0.017273706611105784, "kl": 0.001678466796875, "learning_rate": 3.851494158703026e-07, "loss": 0.0001, "num_tokens": 8713425.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.6581025023417637, "frac_reward_zero_std": 1.0, "grad_norm": 0.027978248395499023, "kl": 0.0016937255859375, "learning_rate": 3.847583372181097e-07, "loss": 0.0001, "num_tokens": 8716481.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 646.0, "completions/max_terminated_length": 646.0, "completions/mean_length": 458.375, "completions/mean_terminated_length": 458.375, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "epoch": 0.6583701324769169, "frac_reward_zero_std": 1.0, "grad_norm": 0.01939744544591882, "kl": 0.001361846923828125, "learning_rate": 3.84367402766008e-07, "loss": 0.0001, "num_tokens": 8721184.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.6586377626120701, "frac_reward_zero_std": 1.0, "grad_norm": 0.015977953487696747, "kl": 0.0018768310546875, "learning_rate": 3.8397661285515146e-07, "loss": 0.0001, "num_tokens": 8724476.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 264.75, "completions/mean_terminated_length": 264.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6589053927472234, "frac_reward_zero_std": 1.0, "grad_norm": 0.017697634688642418, "kl": 0.00138092041015625, "learning_rate": 3.8358596782656725e-07, "loss": 0.0001, "num_tokens": 8727694.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 381.375, "completions/mean_terminated_length": 381.375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6591730228823766, "frac_reward_zero_std": 1.0, "grad_norm": 0.013221646649185527, "kl": 0.001255035400390625, "learning_rate": 3.831954680211567e-07, "loss": 0.0001, "num_tokens": 8731917.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 257.875, "completions/mean_terminated_length": 257.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6594406530175297, "frac_reward_zero_std": 1.0, "grad_norm": 0.03716632277091473, "kl": 0.0026397705078125, "learning_rate": 3.828051137796935e-07, "loss": 0.0001, "num_tokens": 8735012.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 283.625, "completions/mean_terminated_length": 283.625, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.659708283152683, "frac_reward_zero_std": 1.0, "grad_norm": 0.043975979644434456, "kl": 0.002902984619140625, "learning_rate": 3.824149054428253e-07, "loss": 0.0001, "num_tokens": 8738213.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 320.5, "completions/mean_terminated_length": 320.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.6599759132878362, "frac_reward_zero_std": 1.0, "grad_norm": 0.014262038489368637, "kl": 0.0011959075927734375, "learning_rate": 3.820248433510721e-07, "loss": 0.0, "num_tokens": 8741837.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 329.875, "completions/mean_terminated_length": 329.875, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.6602435434229894, "frac_reward_zero_std": 1.0, "grad_norm": 0.04679306906462755, "kl": 0.00262451171875, "learning_rate": 3.81634927844826e-07, "loss": 0.0001, "num_tokens": 8745600.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 260.125, "completions/mean_terminated_length": 260.125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6605111735581426, "frac_reward_zero_std": 1.0, "grad_norm": 0.01993832957657672, "kl": 0.001316070556640625, "learning_rate": 3.8124515926435154e-07, "loss": 0.0001, "num_tokens": 8748585.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.6607788036932959, "frac_reward_zero_std": 1.0, "grad_norm": 0.02431569400191077, "kl": 0.00179290771484375, "learning_rate": 3.8085553794978466e-07, "loss": 0.0001, "num_tokens": 8751692.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 199.375, "completions/mean_terminated_length": 199.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6610464338284491, "frac_reward_zero_std": 1.0, "grad_norm": 0.023194146083067486, "kl": 0.0012187957763671875, "learning_rate": 3.8046606424113327e-07, "loss": 0.0, "num_tokens": 8754295.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6613140639636023, "frac_reward_zero_std": 1.0, "grad_norm": 0.014922203344372094, "kl": 0.001495361328125, "learning_rate": 3.800767384782759e-07, "loss": 0.0001, "num_tokens": 8757272.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.6615816940987556, "frac_reward_zero_std": 0.5, "grad_norm": 1.0313846491687317, "kl": 0.00231170654296875, "learning_rate": 3.796875610009627e-07, "loss": -0.0395, "num_tokens": 8760276.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 379.125, "completions/mean_terminated_length": 379.125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.6618493242339087, "frac_reward_zero_std": 1.0, "grad_norm": 0.013249032127114712, "kl": 0.001316070556640625, "learning_rate": 3.7929853214881346e-07, "loss": 0.0001, "num_tokens": 8764621.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 234.375, "completions/mean_terminated_length": 234.375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6621169543690619, "frac_reward_zero_std": 1.0, "grad_norm": 0.03405374843846355, "kl": 0.00217437744140625, "learning_rate": 3.789096522613191e-07, "loss": 0.0001, "num_tokens": 8767476.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 252.875, "completions/mean_terminated_length": 252.875, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6623845845042152, "frac_reward_zero_std": 1.0, "grad_norm": 0.01727529016116052, "kl": 0.00164794921875, "learning_rate": 3.7852092167784053e-07, "loss": 0.0001, "num_tokens": 8770495.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 412.375, "completions/mean_terminated_length": 412.375, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.6626522146393684, "frac_reward_zero_std": 1.0, "grad_norm": 0.011015429773371945, "kl": 0.000858306884765625, "learning_rate": 3.781323407376076e-07, "loss": 0.0, "num_tokens": 8774926.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.6629198447745216, "frac_reward_zero_std": 1.0, "grad_norm": 0.01871855814582075, "kl": 0.0015106201171875, "learning_rate": 3.777439097797204e-07, "loss": 0.0001, "num_tokens": 8778521.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 223.875, "completions/mean_terminated_length": 223.875, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6631874749096748, "frac_reward_zero_std": 1.0, "grad_norm": 0.01314509878610703, "kl": 0.0008897781372070312, "learning_rate": 3.773556291431476e-07, "loss": 0.0, "num_tokens": 8781248.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 290.875, "completions/mean_terminated_length": 290.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6634551050448281, "frac_reward_zero_std": 1.0, "grad_norm": 0.029417943082099846, "kl": 0.001842498779296875, "learning_rate": 3.76967499166727e-07, "loss": 0.0001, "num_tokens": 8784539.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6637227351799813, "frac_reward_zero_std": 1.0, "grad_norm": 0.017160339007229578, "kl": 0.001384735107421875, "learning_rate": 3.765795201891647e-07, "loss": 0.0001, "num_tokens": 8787991.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.6639903653151344, "frac_reward_zero_std": 0.0, "grad_norm": 0.9787492894063033, "kl": 0.001888275146484375, "learning_rate": 3.761916925490355e-07, "loss": -0.0894, "num_tokens": 8791891.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6642579954502877, "frac_reward_zero_std": 1.0, "grad_norm": 0.012882314642313, "kl": 0.001255035400390625, "learning_rate": 3.758040165847811e-07, "loss": 0.0001, "num_tokens": 8795309.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.6645256255854409, "frac_reward_zero_std": 1.0, "grad_norm": 0.019092340699859652, "kl": 0.00182342529296875, "learning_rate": 3.754164926347122e-07, "loss": 0.0001, "num_tokens": 8799089.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6647932557205941, "frac_reward_zero_std": 1.0, "grad_norm": 0.023605473900008632, "kl": 0.00201416015625, "learning_rate": 3.750291210370057e-07, "loss": 0.0001, "num_tokens": 8802312.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.6650608858557474, "frac_reward_zero_std": 1.0, "grad_norm": 0.03159296236677779, "kl": 0.001544952392578125, "learning_rate": 3.7464190212970625e-07, "loss": 0.0001, "num_tokens": 8805731.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6653285159909006, "frac_reward_zero_std": 1.0, "grad_norm": 0.014722102201546313, "kl": 0.001598358154296875, "learning_rate": 3.7425483625072506e-07, "loss": 0.0001, "num_tokens": 8809805.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.6655961461260538, "frac_reward_zero_std": 1.0, "grad_norm": 0.02355898977115781, "kl": 0.0017852783203125, "learning_rate": 3.738679237378395e-07, "loss": 0.0001, "num_tokens": 8813417.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 365.375, "completions/mean_terminated_length": 365.375, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.665863776261207, "frac_reward_zero_std": 1.0, "grad_norm": 0.007161757264076189, "kl": 0.0005311965942382812, "learning_rate": 3.734811649286935e-07, "loss": 0.0, "num_tokens": 8817424.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6661314063963603, "frac_reward_zero_std": 1.0, "grad_norm": 0.018272420027834153, "kl": 0.00234222412109375, "learning_rate": 3.7309456016079664e-07, "loss": 0.0001, "num_tokens": 8821014.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 232.5, "completions/mean_terminated_length": 232.5, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6663990365315134, "frac_reward_zero_std": 1.0, "grad_norm": 0.02165959560096375, "kl": 0.00183868408203125, "learning_rate": 3.7270810977152433e-07, "loss": 0.0001, "num_tokens": 8823858.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 421.75, "completions/mean_terminated_length": 421.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.6666666666666666, "frac_reward_zero_std": 1.0, "grad_norm": 0.011782126994267186, "kl": 0.0010223388671875, "learning_rate": 3.723218140981169e-07, "loss": 0.0, "num_tokens": 8828256.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.6669342968018199, "frac_reward_zero_std": 1.0, "grad_norm": 0.016209004233313735, "kl": 0.001491546630859375, "learning_rate": 3.7193567347767995e-07, "loss": 0.0001, "num_tokens": 8831479.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 362.875, "completions/mean_terminated_length": 362.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.6672019269369731, "frac_reward_zero_std": 1.0, "grad_norm": 0.009705622358724875, "kl": 0.0008640289306640625, "learning_rate": 3.7154968824718337e-07, "loss": 0.0, "num_tokens": 8835278.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.6674695570721263, "frac_reward_zero_std": 1.0, "grad_norm": 0.034574717112579846, "kl": 0.002349853515625, "learning_rate": 3.711638587434619e-07, "loss": 0.0001, "num_tokens": 8838155.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 396.75, "completions/mean_terminated_length": 396.75, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.6677371872072796, "frac_reward_zero_std": 0.5, "grad_norm": 0.9922635219585216, "kl": 0.00168609619140625, "learning_rate": 3.7077818530321447e-07, "loss": -0.0449, "num_tokens": 8842613.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.6680048173424328, "frac_reward_zero_std": 1.0, "grad_norm": 0.03298207362119831, "kl": 0.0016422271728515625, "learning_rate": 3.703926682630034e-07, "loss": 0.0001, "num_tokens": 8846665.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.668272447477586, "frac_reward_zero_std": 1.0, "grad_norm": 0.024821687875074792, "kl": 0.002277374267578125, "learning_rate": 3.7000730795925464e-07, "loss": 0.0001, "num_tokens": 8849522.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 186.75, "completions/mean_terminated_length": 186.75, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.6685400776127391, "frac_reward_zero_std": 1.0, "grad_norm": 0.07208382187952336, "kl": 0.002841949462890625, "learning_rate": 3.696221047282573e-07, "loss": 0.0001, "num_tokens": 8852032.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.6688077077478924, "frac_reward_zero_std": 1.0, "grad_norm": 0.014891385540289983, "kl": 0.0012054443359375, "learning_rate": 3.6923705890616385e-07, "loss": 0.0, "num_tokens": 8855521.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 293.125, "completions/mean_terminated_length": 293.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.6690753378830456, "frac_reward_zero_std": 1.0, "grad_norm": 0.02212110849679341, "kl": 0.00270843505859375, "learning_rate": 3.6885217082898866e-07, "loss": 0.0001, "num_tokens": 8858838.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 237.5, "completions/mean_terminated_length": 237.5, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6693429680181988, "frac_reward_zero_std": 1.0, "grad_norm": 0.18980872145635894, "kl": 0.004688262939453125, "learning_rate": 3.684674408326094e-07, "loss": 0.0002, "num_tokens": 8861778.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 280.25, "completions/mean_terminated_length": 280.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6696105981533521, "frac_reward_zero_std": 1.0, "grad_norm": 0.0155711827827792, "kl": 0.001476287841796875, "learning_rate": 3.6808286925276477e-07, "loss": 0.0001, "num_tokens": 8865020.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6698782282885053, "frac_reward_zero_std": 1.0, "grad_norm": 0.013840657566815576, "kl": 0.001544952392578125, "learning_rate": 3.676984564250559e-07, "loss": 0.0001, "num_tokens": 8868335.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 576.0, "completions/max_terminated_length": 576.0, "completions/mean_length": 370.25, "completions/mean_terminated_length": 370.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.6701458584236585, "frac_reward_zero_std": 0.5, "grad_norm": 0.6077866456190967, "kl": 0.0013294219970703125, "learning_rate": 3.673142026849454e-07, "loss": -0.0086, "num_tokens": 8872401.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 306.5, "completions/mean_terminated_length": 306.5, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.6704134885588118, "frac_reward_zero_std": 1.0, "grad_norm": 0.022007812379670266, "kl": 0.002040863037109375, "learning_rate": 3.669301083677563e-07, "loss": 0.0001, "num_tokens": 8875877.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.670681118693965, "frac_reward_zero_std": 1.0, "grad_norm": 0.036430561650872724, "kl": 0.002655029296875, "learning_rate": 3.665461738086737e-07, "loss": 0.0001, "num_tokens": 8879171.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 232.0, "completions/max_terminated_length": 232.0, "completions/mean_length": 196.875, "completions/mean_terminated_length": 196.875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6709487488291181, "frac_reward_zero_std": 1.0, "grad_norm": 0.022939292840217552, "kl": 0.00162506103515625, "learning_rate": 3.6616239934274205e-07, "loss": 0.0001, "num_tokens": 8881638.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.6712163789642713, "frac_reward_zero_std": 1.0, "grad_norm": 0.015767997460042916, "kl": 0.001590728759765625, "learning_rate": 3.6577878530486703e-07, "loss": 0.0001, "num_tokens": 8884458.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6714840090994246, "frac_reward_zero_std": 1.0, "grad_norm": 0.02174619864636362, "kl": 0.001873016357421875, "learning_rate": 3.65395332029814e-07, "loss": 0.0001, "num_tokens": 8887558.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 369.125, "completions/mean_terminated_length": 369.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6717516392345778, "frac_reward_zero_std": 1.0, "grad_norm": 0.03639338304433031, "kl": 0.00183868408203125, "learning_rate": 3.6501203985220784e-07, "loss": 0.0001, "num_tokens": 8891571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 312.125, "completions/mean_terminated_length": 312.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.672019269369731, "frac_reward_zero_std": 0.5, "grad_norm": 0.7890991837536673, "kl": 0.00141143798828125, "learning_rate": 3.646289091065329e-07, "loss": 0.0151, "num_tokens": 8895188.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 271.5, "completions/mean_terminated_length": 271.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.6722868995048843, "frac_reward_zero_std": 1.0, "grad_norm": 0.015335528689598334, "kl": 0.00160980224609375, "learning_rate": 3.6424594012713307e-07, "loss": 0.0001, "num_tokens": 8898480.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 693.0, "completions/mean_length": 566.75, "completions/mean_terminated_length": 501.4285888671875, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "epoch": 0.6725545296400375, "frac_reward_zero_std": 0.5, "grad_norm": 0.5022282259745946, "kl": 0.0012798309326171875, "learning_rate": 3.6386313324821026e-07, "loss": 0.0583, "num_tokens": 8904150.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.6728221597751907, "frac_reward_zero_std": 1.0, "grad_norm": 0.030407366403547496, "kl": 0.0020809173583984375, "learning_rate": 3.63480488803826e-07, "loss": 0.0001, "num_tokens": 8907094.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 285.75, "completions/mean_terminated_length": 285.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6730897899103438, "frac_reward_zero_std": 1.0, "grad_norm": 0.016407345781888467, "kl": 0.00133514404296875, "learning_rate": 3.6309800712789926e-07, "loss": 0.0001, "num_tokens": 8910396.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 245.125, "completions/mean_terminated_length": 245.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.6733574200454971, "frac_reward_zero_std": 1.0, "grad_norm": 0.024142424193823075, "kl": 0.00157928466796875, "learning_rate": 3.6271568855420697e-07, "loss": 0.0001, "num_tokens": 8913497.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 433.625, "completions/mean_terminated_length": 433.625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.6736250501806503, "frac_reward_zero_std": 1.0, "grad_norm": 0.014075961071925587, "kl": 0.00165557861328125, "learning_rate": 3.623335334163843e-07, "loss": 0.0001, "num_tokens": 8918010.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 336.0, "completions/mean_terminated_length": 336.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.6738926803158035, "frac_reward_zero_std": 1.0, "grad_norm": 0.2374659523890329, "kl": 0.004573822021484375, "learning_rate": 3.6195154204792334e-07, "loss": 0.0002, "num_tokens": 8921842.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 305.375, "completions/mean_terminated_length": 305.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.6741603104509568, "frac_reward_zero_std": 1.0, "grad_norm": 0.019846837975839096, "kl": 0.0020904541015625, "learning_rate": 3.6156971478217346e-07, "loss": 0.0001, "num_tokens": 8925405.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.67442794058611, "frac_reward_zero_std": 1.0, "grad_norm": 0.014361072557414024, "kl": 0.001308441162109375, "learning_rate": 3.6118805195234026e-07, "loss": 0.0001, "num_tokens": 8928685.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6746955707212632, "frac_reward_zero_std": 1.0, "grad_norm": 0.013907572371253254, "kl": 0.000988006591796875, "learning_rate": 3.6080655389148704e-07, "loss": 0.0, "num_tokens": 8932262.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 265.375, "completions/mean_terminated_length": 265.375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.6749632008564165, "frac_reward_zero_std": 0.5, "grad_norm": 1.1374328772435396, "kl": 0.0034332275390625, "learning_rate": 3.604252209325319e-07, "loss": -0.044, "num_tokens": 8935625.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.6752308309915697, "frac_reward_zero_std": 1.0, "grad_norm": 0.01819647475155997, "kl": 0.001461029052734375, "learning_rate": 3.6004405340825005e-07, "loss": 0.0001, "num_tokens": 8939118.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.6754984611267228, "frac_reward_zero_std": 1.0, "grad_norm": 0.01126305542404046, "kl": 0.0007114410400390625, "learning_rate": 3.5966305165127197e-07, "loss": 0.0, "num_tokens": 8942935.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 545.0, "completions/max_terminated_length": 545.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.675766091261876, "frac_reward_zero_std": 1.0, "grad_norm": 0.02504008190305208, "kl": 0.000926971435546875, "learning_rate": 3.592822159940825e-07, "loss": 0.0, "num_tokens": 8946486.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 484.0, "completions/max_terminated_length": 484.0, "completions/mean_length": 421.375, "completions/mean_terminated_length": 421.375, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.6760337213970293, "frac_reward_zero_std": 0.5, "grad_norm": 0.7167910007699252, "kl": 0.00145721435546875, "learning_rate": 3.589015467690234e-07, "loss": 0.0012, "num_tokens": 8951109.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 241.0, "completions/max_terminated_length": 241.0, "completions/mean_length": 224.625, "completions/mean_terminated_length": 224.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6763013515321825, "frac_reward_zero_std": 1.0, "grad_norm": 0.02076495573977773, "kl": 0.00191497802734375, "learning_rate": 3.5852104430828943e-07, "loss": 0.0001, "num_tokens": 8953878.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.6765689816673357, "frac_reward_zero_std": 1.0, "grad_norm": 0.03931575778399644, "kl": 0.00312042236328125, "learning_rate": 3.581407089439311e-07, "loss": 0.0001, "num_tokens": 8956462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.676836611802489, "frac_reward_zero_std": 1.0, "grad_norm": 0.012412692015062642, "kl": 0.0011425018310546875, "learning_rate": 3.5776054100785226e-07, "loss": 0.0, "num_tokens": 8959676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6771042419376422, "frac_reward_zero_std": 1.0, "grad_norm": 0.022929622206143835, "kl": 0.00185394287109375, "learning_rate": 3.573805408318109e-07, "loss": 0.0001, "num_tokens": 8963336.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 253.625, "completions/mean_terminated_length": 253.625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.6773718720727954, "frac_reward_zero_std": 0.5, "grad_norm": 1.2629499786069616, "kl": 0.002498626708984375, "learning_rate": 3.570007087474188e-07, "loss": -0.0088, "num_tokens": 8966569.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 300.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6776395022079487, "frac_reward_zero_std": 1.0, "grad_norm": 0.011187040789934561, "kl": 0.001071929931640625, "learning_rate": 3.56621045086141e-07, "loss": 0.0, "num_tokens": 8969996.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 314.125, "completions/mean_terminated_length": 314.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6779071323431018, "frac_reward_zero_std": 1.0, "grad_norm": 0.018793484508161512, "kl": 0.0009765625, "learning_rate": 3.5624155017929525e-07, "loss": 0.0, "num_tokens": 8973501.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.678174762478255, "frac_reward_zero_std": 1.0, "grad_norm": 0.019641107295164578, "kl": 0.001857757568359375, "learning_rate": 3.5586222435805235e-07, "loss": 0.0001, "num_tokens": 8976371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 338.25, "completions/mean_terminated_length": 338.25, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6784423926134082, "frac_reward_zero_std": 1.0, "grad_norm": 0.02027486409214013, "kl": 0.0023651123046875, "learning_rate": 3.5548306795343565e-07, "loss": 0.0001, "num_tokens": 8980101.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.6787100227485615, "frac_reward_zero_std": 1.0, "grad_norm": 0.015692346561833635, "kl": 0.00127410888671875, "learning_rate": 3.551040812963203e-07, "loss": 0.0001, "num_tokens": 8983371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6789776528837147, "frac_reward_zero_std": 1.0, "grad_norm": 0.013859171738368035, "kl": 0.00128936767578125, "learning_rate": 3.547252647174335e-07, "loss": 0.0001, "num_tokens": 8986743.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 357.25, "completions/mean_terminated_length": 357.25, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.6792452830188679, "frac_reward_zero_std": 1.0, "grad_norm": 0.009674341316967275, "kl": 0.000965118408203125, "learning_rate": 3.54346618547354e-07, "loss": 0.0, "num_tokens": 8990577.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6795129131540212, "frac_reward_zero_std": 1.0, "grad_norm": 0.013172126453073009, "kl": 0.0009307861328125, "learning_rate": 3.53968143116512e-07, "loss": 0.0, "num_tokens": 8993754.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6797805432891744, "frac_reward_zero_std": 1.0, "grad_norm": 0.026770215940364203, "kl": 0.001895904541015625, "learning_rate": 3.5358983875518845e-07, "loss": 0.0001, "num_tokens": 8997045.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 401.625, "completions/mean_terminated_length": 401.625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.6800481734243276, "frac_reward_zero_std": 0.5, "grad_norm": 0.7239038810215636, "kl": 0.0020751953125, "learning_rate": 3.532117057935151e-07, "loss": 0.0279, "num_tokens": 9001382.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 341.0, "completions/mean_terminated_length": 341.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6803158035594808, "frac_reward_zero_std": 0.5, "grad_norm": 0.5863620071151637, "kl": 0.001567840576171875, "learning_rate": 3.528337445614742e-07, "loss": 0.0056, "num_tokens": 9005286.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.680583433694634, "frac_reward_zero_std": 1.0, "grad_norm": 0.015984173506759014, "kl": 0.00177001953125, "learning_rate": 3.5245595538889804e-07, "loss": 0.0001, "num_tokens": 9008578.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.6808510638297872, "frac_reward_zero_std": 1.0, "grad_norm": 0.027291117946922152, "kl": 0.0031890869140625, "learning_rate": 3.5207833860546886e-07, "loss": 0.0001, "num_tokens": 9012910.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6811186939649404, "frac_reward_zero_std": 1.0, "grad_norm": 0.020961493841860992, "kl": 0.001583099365234375, "learning_rate": 3.517008945407185e-07, "loss": 0.0001, "num_tokens": 9016046.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 303.875, "completions/mean_terminated_length": 303.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.6813863241000937, "frac_reward_zero_std": 0.5, "grad_norm": 0.730495273060697, "kl": 0.0017242431640625, "learning_rate": 3.513236235240278e-07, "loss": -0.0111, "num_tokens": 9019537.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 308.125, "completions/mean_terminated_length": 308.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6816539542352469, "frac_reward_zero_std": 0.5, "grad_norm": 0.8648763921489093, "kl": 0.001617431640625, "learning_rate": 3.509465258846268e-07, "loss": 0.0001, "num_tokens": 9023018.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 224.5, "completions/mean_terminated_length": 224.5, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6819215843704001, "frac_reward_zero_std": 1.0, "grad_norm": 0.018272335278147273, "kl": 0.001373291015625, "learning_rate": 3.505696019515946e-07, "loss": 0.0001, "num_tokens": 9025758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 501.0, "completions/max_terminated_length": 501.0, "completions/mean_length": 358.375, "completions/mean_terminated_length": 358.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.6821892145055534, "frac_reward_zero_std": 1.0, "grad_norm": 0.01584068607246318, "kl": 0.0019378662109375, "learning_rate": 3.501928520538575e-07, "loss": 0.0001, "num_tokens": 9029809.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 247.625, "completions/mean_terminated_length": 247.625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.6824568446407066, "frac_reward_zero_std": 1.0, "grad_norm": 0.0269055159373369, "kl": 0.00201416015625, "learning_rate": 3.4981627652019177e-07, "loss": 0.0001, "num_tokens": 9032826.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.6827244747758597, "frac_reward_zero_std": 0.5, "grad_norm": 0.9863645524338626, "kl": 0.005840301513671875, "learning_rate": 3.494398756792196e-07, "loss": 0.0002, "num_tokens": 9036471.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.682992104911013, "frac_reward_zero_std": 1.0, "grad_norm": 0.014753003712521333, "kl": 0.001361846923828125, "learning_rate": 3.4906364985941215e-07, "loss": 0.0001, "num_tokens": 9040632.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6832597350461662, "frac_reward_zero_std": 1.0, "grad_norm": 0.011916967234728819, "kl": 0.00092315673828125, "learning_rate": 3.486875993890874e-07, "loss": 0.0, "num_tokens": 9044217.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6835273651813194, "frac_reward_zero_std": 1.0, "grad_norm": 0.016594618315557967, "kl": 0.001232147216796875, "learning_rate": 3.483117245964097e-07, "loss": 0.0, "num_tokens": 9046990.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 346.125, "completions/mean_terminated_length": 346.125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.6837949953164726, "frac_reward_zero_std": 0.5, "grad_norm": 0.8576169022926942, "kl": 0.00118255615234375, "learning_rate": 3.4793602580939134e-07, "loss": 0.0334, "num_tokens": 9051091.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 268.0, "completions/max_terminated_length": 268.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.6840626254516259, "frac_reward_zero_std": 1.0, "grad_norm": 0.011748530099851526, "kl": 0.00115966796875, "learning_rate": 3.475605033558896e-07, "loss": 0.0, "num_tokens": 9053933.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 294.25, "completions/mean_terminated_length": 294.25, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.6843302555867791, "frac_reward_zero_std": 1.0, "grad_norm": 0.009748748784779986, "kl": 0.0008716583251953125, "learning_rate": 3.4718515756360935e-07, "loss": 0.0, "num_tokens": 9057371.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 298.0, "completions/mean_terminated_length": 298.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6845978857219323, "frac_reward_zero_std": 1.0, "grad_norm": 0.016224693785154676, "kl": 0.00086212158203125, "learning_rate": 3.4680998876009987e-07, "loss": 0.0, "num_tokens": 9060675.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.6848655158570855, "frac_reward_zero_std": 1.0, "grad_norm": 0.014396965352935014, "kl": 0.000659942626953125, "learning_rate": 3.46434997272757e-07, "loss": 0.0, "num_tokens": 9064486.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.6851331459922387, "frac_reward_zero_std": 1.0, "grad_norm": 0.029225986115758926, "kl": 0.00205230712890625, "learning_rate": 3.4606018342882133e-07, "loss": 0.0001, "num_tokens": 9067699.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 368.125, "completions/mean_terminated_length": 368.125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.6854007761273919, "frac_reward_zero_std": 0.5, "grad_norm": 0.613074513323333, "kl": 0.001739501953125, "learning_rate": 3.456855475553787e-07, "loss": 0.0516, "num_tokens": 9071772.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 244.5, "completions/mean_terminated_length": 244.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.6856684062625452, "frac_reward_zero_std": 1.0, "grad_norm": 0.03165017405354233, "kl": 0.0014667510986328125, "learning_rate": 3.4531108997935955e-07, "loss": 0.0001, "num_tokens": 9074732.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6859360363976984, "frac_reward_zero_std": 1.0, "grad_norm": 0.012029749973561366, "kl": 0.00109100341796875, "learning_rate": 3.4493681102753846e-07, "loss": 0.0, "num_tokens": 9078439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 275.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6862036665328516, "frac_reward_zero_std": 1.0, "grad_norm": 0.017156672852332393, "kl": 0.00103759765625, "learning_rate": 3.4456271102653495e-07, "loss": 0.0, "num_tokens": 9081734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.6864712966680048, "frac_reward_zero_std": 1.0, "grad_norm": 0.025793746660643133, "kl": 0.002193450927734375, "learning_rate": 3.441887903028113e-07, "loss": 0.0001, "num_tokens": 9085203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 285.375, "completions/mean_terminated_length": 285.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6867389268031581, "frac_reward_zero_std": 1.0, "grad_norm": 0.01958507447867715, "kl": 0.001079559326171875, "learning_rate": 3.43815049182674e-07, "loss": 0.0, "num_tokens": 9088582.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.6870065569383113, "frac_reward_zero_std": 1.0, "grad_norm": 0.07043837802617099, "kl": 0.003662109375, "learning_rate": 3.434414879922727e-07, "loss": 0.0001, "num_tokens": 9092522.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 276.875, "completions/mean_terminated_length": 276.875, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.6872741870734644, "frac_reward_zero_std": 1.0, "grad_norm": 0.023698665192797806, "kl": 0.00118255615234375, "learning_rate": 3.430681070575999e-07, "loss": 0.0, "num_tokens": 9095713.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 259.5, "completions/mean_terminated_length": 259.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.6875418172086177, "frac_reward_zero_std": 1.0, "grad_norm": 0.016183870295278818, "kl": 0.00135040283203125, "learning_rate": 3.42694906704491e-07, "loss": 0.0001, "num_tokens": 9098709.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 387.25, "completions/mean_terminated_length": 387.25, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.6878094473437709, "frac_reward_zero_std": 1.0, "grad_norm": 0.009557279499451525, "kl": 0.001049041748046875, "learning_rate": 3.4232188725862354e-07, "loss": 0.0, "num_tokens": 9102823.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 469.25, "completions/mean_terminated_length": 469.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.6880770774789241, "frac_reward_zero_std": 1.0, "grad_norm": 0.058822089322965364, "kl": 0.0040130615234375, "learning_rate": 3.419490490455176e-07, "loss": 0.0002, "num_tokens": 9107661.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 322.5, "completions/mean_terminated_length": 322.5, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.6883447076140774, "frac_reward_zero_std": 1.0, "grad_norm": 0.3633623170796719, "kl": 0.00452423095703125, "learning_rate": 3.4157639239053466e-07, "loss": 0.0002, "num_tokens": 9111333.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.6886123377492306, "frac_reward_zero_std": 1.0, "grad_norm": 0.011844675457978229, "kl": 0.0008373260498046875, "learning_rate": 3.4120391761887804e-07, "loss": 0.0, "num_tokens": 9115051.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 540.125, "completions/mean_terminated_length": 540.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.6888799678843838, "frac_reward_zero_std": 1.0, "grad_norm": 0.020458213025296705, "kl": 0.001720428466796875, "learning_rate": 3.4083162505559213e-07, "loss": 0.0001, "num_tokens": 9120352.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.689147598019537, "frac_reward_zero_std": 1.0, "grad_norm": 0.02292047342868892, "kl": 0.002017974853515625, "learning_rate": 3.4045951502556267e-07, "loss": 0.0001, "num_tokens": 9123408.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 528.0, "completions/max_terminated_length": 528.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.6894152281546903, "frac_reward_zero_std": 0.5, "grad_norm": 1.0231571509254058, "kl": 0.00299835205078125, "learning_rate": 3.4008758785351577e-07, "loss": 0.038, "num_tokens": 9127702.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 227.625, "completions/mean_terminated_length": 227.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6896828582898434, "frac_reward_zero_std": 1.0, "grad_norm": 0.017979342218260013, "kl": 0.00179290771484375, "learning_rate": 3.397158438640181e-07, "loss": 0.0001, "num_tokens": 9130579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.6899504884249966, "frac_reward_zero_std": 1.0, "grad_norm": 0.024448017483627824, "kl": 0.00214385986328125, "learning_rate": 3.3934428338147616e-07, "loss": 0.0001, "num_tokens": 9133913.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 237.75, "completions/mean_terminated_length": 237.75, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6902181185601499, "frac_reward_zero_std": 1.0, "grad_norm": 0.02160053508131794, "kl": 0.001415252685546875, "learning_rate": 3.3897290673013714e-07, "loss": 0.0001, "num_tokens": 9136671.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 250.25, "completions/mean_terminated_length": 250.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.6904857486953031, "frac_reward_zero_std": 1.0, "grad_norm": 0.014830494645504753, "kl": 0.001148223876953125, "learning_rate": 3.3860171423408666e-07, "loss": 0.0, "num_tokens": 9139717.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 216.875, "completions/mean_terminated_length": 216.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6907533788304563, "frac_reward_zero_std": 1.0, "grad_norm": 0.010855511263878358, "kl": 0.000949859619140625, "learning_rate": 3.382307062172508e-07, "loss": 0.0, "num_tokens": 9142388.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 323.375, "completions/mean_terminated_length": 323.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.6910210089656095, "frac_reward_zero_std": 1.0, "grad_norm": 0.020294810522790217, "kl": 0.002532958984375, "learning_rate": 3.378598830033938e-07, "loss": 0.0001, "num_tokens": 9146031.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.6912886391007628, "frac_reward_zero_std": 0.5, "grad_norm": 0.5377666580396372, "kl": 0.00267791748046875, "learning_rate": 3.3748924491611875e-07, "loss": -0.0036, "num_tokens": 9149503.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.691556269235916, "frac_reward_zero_std": 1.0, "grad_norm": 0.022076252377813483, "kl": 0.001674652099609375, "learning_rate": 3.3711879227886787e-07, "loss": 0.0001, "num_tokens": 9152450.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 332.75, "completions/mean_terminated_length": 332.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.6918238993710691, "frac_reward_zero_std": 1.0, "grad_norm": 0.01488721656228416, "kl": 0.001285552978515625, "learning_rate": 3.3674852541492046e-07, "loss": 0.0001, "num_tokens": 9156160.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 291.0, "completions/mean_terminated_length": 291.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.6920915295062224, "frac_reward_zero_std": 1.0, "grad_norm": 0.05469439062659414, "kl": 0.002567291259765625, "learning_rate": 3.363784446473949e-07, "loss": 0.0001, "num_tokens": 9159528.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 268.625, "completions/mean_terminated_length": 268.625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.6923591596413756, "frac_reward_zero_std": 0.5, "grad_norm": 0.9483180570943518, "kl": 0.001766204833984375, "learning_rate": 3.3600855029924587e-07, "loss": -0.0208, "num_tokens": 9162733.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.6926267897765288, "frac_reward_zero_std": 1.0, "grad_norm": 0.01447802160099375, "kl": 0.00128936767578125, "learning_rate": 3.356388426932667e-07, "loss": 0.0001, "num_tokens": 9166110.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 270.0, "completions/max_terminated_length": 270.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.6928944199116821, "frac_reward_zero_std": 1.0, "grad_norm": 0.018367959449226666, "kl": 0.001117706298828125, "learning_rate": 3.3526932215208666e-07, "loss": 0.0, "num_tokens": 9168940.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 435.0, "completions/max_terminated_length": 435.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 321.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.6931620500468353, "frac_reward_zero_std": 1.0, "grad_norm": 0.022016154780166268, "kl": 0.00183868408203125, "learning_rate": 3.3489998899817234e-07, "loss": 0.0001, "num_tokens": 9172595.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 254.625, "completions/mean_terminated_length": 254.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.6934296801819885, "frac_reward_zero_std": 1.0, "grad_norm": 0.012472949853105673, "kl": 0.000713348388671875, "learning_rate": 3.3453084355382673e-07, "loss": 0.0, "num_tokens": 9175676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.6936973103171417, "frac_reward_zero_std": 1.0, "grad_norm": 0.04015844345419198, "kl": 0.00197601318359375, "learning_rate": 3.341618861411887e-07, "loss": 0.0001, "num_tokens": 9178597.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 309.375, "completions/mean_terminated_length": 309.375, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.693964940452295, "frac_reward_zero_std": 1.0, "grad_norm": 0.03738907461900811, "kl": 0.00212860107421875, "learning_rate": 3.3379311708223393e-07, "loss": 0.0001, "num_tokens": 9182100.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 344.0, "completions/mean_terminated_length": 344.0, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.6942325705874481, "frac_reward_zero_std": 1.0, "grad_norm": 0.019377559006138467, "kl": 0.001697540283203125, "learning_rate": 3.3342453669877256e-07, "loss": 0.0001, "num_tokens": 9186176.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.6945002007226013, "frac_reward_zero_std": 1.0, "grad_norm": 0.014881051295654186, "kl": 0.00138092041015625, "learning_rate": 3.3305614531245075e-07, "loss": 0.0001, "num_tokens": 9189304.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 257.75, "completions/mean_terminated_length": 257.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.6947678308577546, "frac_reward_zero_std": 1.0, "grad_norm": 0.015261014595561432, "kl": 0.001285552978515625, "learning_rate": 3.326879432447496e-07, "loss": 0.0001, "num_tokens": 9192446.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 245.875, "completions/mean_terminated_length": 245.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.6950354609929078, "frac_reward_zero_std": 1.0, "grad_norm": 0.017443784978780727, "kl": 0.001209259033203125, "learning_rate": 3.3231993081698505e-07, "loss": 0.0, "num_tokens": 9195349.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 334.375, "completions/mean_terminated_length": 334.375, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.695303091128061, "frac_reward_zero_std": 1.0, "grad_norm": 0.013549494952055128, "kl": 0.00121307373046875, "learning_rate": 3.3195210835030744e-07, "loss": 0.0, "num_tokens": 9199248.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.6955707212632143, "frac_reward_zero_std": 1.0, "grad_norm": 0.013361532536284873, "kl": 0.00177001953125, "learning_rate": 3.315844761657014e-07, "loss": 0.0001, "num_tokens": 9202454.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 345.0, "completions/mean_terminated_length": 345.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.6958383513983675, "frac_reward_zero_std": 1.0, "grad_norm": 0.009306627524136416, "kl": 0.0009860992431640625, "learning_rate": 3.3121703458398566e-07, "loss": 0.0, "num_tokens": 9206314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 354.5, "completions/mean_terminated_length": 354.5, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.6961059815335207, "frac_reward_zero_std": 1.0, "grad_norm": 0.027768863108973328, "kl": 0.00244140625, "learning_rate": 3.308497839258122e-07, "loss": 0.0001, "num_tokens": 9210258.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 316.25, "completions/mean_terminated_length": 316.25, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.6963736116686738, "frac_reward_zero_std": 0.5, "grad_norm": 0.7740743264593617, "kl": 0.001865386962890625, "learning_rate": 3.3048272451166684e-07, "loss": 0.0557, "num_tokens": 9213944.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 306.25, "completions/mean_terminated_length": 306.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.6966412418038271, "frac_reward_zero_std": 1.0, "grad_norm": 0.02651683545578048, "kl": 0.001575469970703125, "learning_rate": 3.3011585666186826e-07, "loss": 0.0001, "num_tokens": 9217542.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.6969088719389803, "frac_reward_zero_std": 0.5, "grad_norm": 0.8362156751085831, "kl": 0.00157928466796875, "learning_rate": 3.297491806965679e-07, "loss": 0.0146, "num_tokens": 9221093.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.6971765020741335, "frac_reward_zero_std": 1.0, "grad_norm": 0.007992929456988113, "kl": 0.0006313323974609375, "learning_rate": 3.2938269693575016e-07, "loss": 0.0, "num_tokens": 9224740.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 305.875, "completions/mean_terminated_length": 305.875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.6974441322092868, "frac_reward_zero_std": 0.5, "grad_norm": 1.2479792842519926, "kl": 0.00235748291015625, "learning_rate": 3.2901640569923147e-07, "loss": 0.0131, "num_tokens": 9228191.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 183.125, "completions/mean_terminated_length": 183.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.69771176234444, "frac_reward_zero_std": 1.0, "grad_norm": 0.020404348723748626, "kl": 0.001399993896484375, "learning_rate": 3.2865030730665957e-07, "loss": 0.0001, "num_tokens": 9230580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 410.0, "completions/mean_terminated_length": 410.0, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.6979793924795932, "frac_reward_zero_std": 0.5, "grad_norm": 0.5012478719536805, "kl": 0.001003265380859375, "learning_rate": 3.282844020775154e-07, "loss": 0.0725, "num_tokens": 9235192.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.6982470226147465, "frac_reward_zero_std": 1.0, "grad_norm": 0.012578089776683224, "kl": 0.00122833251953125, "learning_rate": 3.279186903311097e-07, "loss": 0.0, "num_tokens": 9238827.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 654.0, "completions/max_terminated_length": 654.0, "completions/mean_length": 389.625, "completions/mean_terminated_length": 389.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6985146527498997, "frac_reward_zero_std": 1.0, "grad_norm": 0.010657818950293942, "kl": 0.0010051727294921875, "learning_rate": 3.2755317238658585e-07, "loss": 0.0, "num_tokens": 9243012.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.6987822828850528, "frac_reward_zero_std": 1.0, "grad_norm": 0.03187643456172726, "kl": 0.002315521240234375, "learning_rate": 3.2718784856291674e-07, "loss": 0.0001, "num_tokens": 9246274.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.699049913020206, "frac_reward_zero_std": 1.0, "grad_norm": 0.029169271708387966, "kl": 0.00141143798828125, "learning_rate": 3.2682271917890715e-07, "loss": 0.0001, "num_tokens": 9249547.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.6993175431553593, "frac_reward_zero_std": 1.0, "grad_norm": 0.02816997388042141, "kl": 0.0026092529296875, "learning_rate": 3.264577845531914e-07, "loss": 0.0001, "num_tokens": 9253074.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 246.0, "completions/mean_terminated_length": 246.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.6995851732905125, "frac_reward_zero_std": 1.0, "grad_norm": 0.013188680069061964, "kl": 0.0007610321044921875, "learning_rate": 3.2609304500423354e-07, "loss": 0.0, "num_tokens": 9255970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.6998528034256657, "frac_reward_zero_std": 1.0, "grad_norm": 0.02604180589476459, "kl": 0.0023040771484375, "learning_rate": 3.2572850085032854e-07, "loss": 0.0001, "num_tokens": 9259737.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.700120433560819, "frac_reward_zero_std": 1.0, "grad_norm": 0.01848732197220927, "kl": 0.001251220703125, "learning_rate": 3.2536415240959953e-07, "loss": 0.0001, "num_tokens": 9262993.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 368.625, "completions/mean_terminated_length": 368.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7003880636959722, "frac_reward_zero_std": 1.0, "grad_norm": 0.015822372915758984, "kl": 0.001743316650390625, "learning_rate": 3.250000000000001e-07, "loss": 0.0001, "num_tokens": 9266878.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 275.625, "completions/mean_terminated_length": 275.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7006556938311254, "frac_reward_zero_std": 1.0, "grad_norm": 0.023433726464333917, "kl": 0.0014801025390625, "learning_rate": 3.2463604393931165e-07, "loss": 0.0001, "num_tokens": 9270067.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7009233239662787, "frac_reward_zero_std": 0.5, "grad_norm": 0.8015896993167163, "kl": 0.00159454345703125, "learning_rate": 3.24272284545145e-07, "loss": 0.0068, "num_tokens": 9273128.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 309.25, "completions/mean_terminated_length": 309.25, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7011909541014318, "frac_reward_zero_std": 1.0, "grad_norm": 0.03233939160967734, "kl": 0.002056121826171875, "learning_rate": 3.2390872213493893e-07, "loss": 0.0001, "num_tokens": 9276762.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 252.1428680419922, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.701458584236585, "frac_reward_zero_std": 0.5, "grad_norm": 1.1222877764187018, "kl": 0.003143310546875, "learning_rate": 3.235453570259604e-07, "loss": 0.1555, "num_tokens": 9280635.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 290.5, "completions/mean_terminated_length": 290.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7017262143717382, "frac_reward_zero_std": 1.0, "grad_norm": 0.013447177757171221, "kl": 0.0014286041259765625, "learning_rate": 3.231821895353048e-07, "loss": 0.0001, "num_tokens": 9283883.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7019938445068915, "frac_reward_zero_std": 0.5, "grad_norm": 0.695475461653712, "kl": 0.000949859619140625, "learning_rate": 3.2281921997989404e-07, "loss": 0.0484, "num_tokens": 9287495.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 228.5, "completions/mean_terminated_length": 228.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7022614746420447, "frac_reward_zero_std": 1.0, "grad_norm": 0.01580186494926509, "kl": 0.00116729736328125, "learning_rate": 3.224564486764779e-07, "loss": 0.0, "num_tokens": 9290207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 215.125, "completions/mean_terminated_length": 215.125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.7025291047771979, "frac_reward_zero_std": 1.0, "grad_norm": 0.0178806897987684, "kl": 0.00136566162109375, "learning_rate": 3.220938759416331e-07, "loss": 0.0001, "num_tokens": 9292948.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 337.0, "completions/mean_terminated_length": 337.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7027967349123512, "frac_reward_zero_std": 1.0, "grad_norm": 0.009263990081680287, "kl": 0.0008831024169921875, "learning_rate": 3.2173150209176295e-07, "loss": 0.0, "num_tokens": 9296604.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7030643650475044, "frac_reward_zero_std": 1.0, "grad_norm": 0.01572117745941552, "kl": 0.0016937255859375, "learning_rate": 3.213693274430974e-07, "loss": 0.0001, "num_tokens": 9300270.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 226.625, "completions/mean_terminated_length": 226.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.7033319951826575, "frac_reward_zero_std": 1.0, "grad_norm": 0.019771784694665732, "kl": 0.002025604248046875, "learning_rate": 3.2100735231169237e-07, "loss": 0.0001, "num_tokens": 9303119.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 262.625, "completions/mean_terminated_length": 262.625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7035996253178108, "frac_reward_zero_std": 1.0, "grad_norm": 0.01758166583667193, "kl": 0.0013885498046875, "learning_rate": 3.2064557701342965e-07, "loss": 0.0001, "num_tokens": 9306252.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.703867255452964, "frac_reward_zero_std": 1.0, "grad_norm": 0.026085651756127162, "kl": 0.001811981201171875, "learning_rate": 3.2028400186401703e-07, "loss": 0.0001, "num_tokens": 9309994.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 180.375, "completions/mean_terminated_length": 180.375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.7041348855881172, "frac_reward_zero_std": 1.0, "grad_norm": 0.040868771338055325, "kl": 0.001953125, "learning_rate": 3.1992262717898687e-07, "loss": 0.0001, "num_tokens": 9312405.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 220.875, "completions/mean_terminated_length": 220.875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7044025157232704, "frac_reward_zero_std": 1.0, "grad_norm": 0.027208666880963628, "kl": 0.0022125244140625, "learning_rate": 3.1956145327369743e-07, "loss": 0.0001, "num_tokens": 9315260.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7046701458584237, "frac_reward_zero_std": 1.0, "grad_norm": 0.018084591148168752, "kl": 0.001495361328125, "learning_rate": 3.1920048046333125e-07, "loss": 0.0001, "num_tokens": 9318368.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7049377759935769, "frac_reward_zero_std": 0.5, "grad_norm": 0.7948541196629701, "kl": 0.0033111572265625, "learning_rate": 3.1883970906289566e-07, "loss": -0.0575, "num_tokens": 9321935.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 236.75, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7052054061287301, "frac_reward_zero_std": 1.0, "grad_norm": 0.01514869398099155, "kl": 0.0010528564453125, "learning_rate": 3.1847913938722194e-07, "loss": 0.0, "num_tokens": 9324817.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 370.25, "completions/mean_terminated_length": 370.25, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.7054730362638834, "frac_reward_zero_std": 0.5, "grad_norm": 0.7579191225793659, "kl": 0.00168609619140625, "learning_rate": 3.181187717509655e-07, "loss": 0.0233, "num_tokens": 9328815.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7057406663990365, "frac_reward_zero_std": 1.0, "grad_norm": 0.009753202809188174, "kl": 0.000888824462890625, "learning_rate": 3.1775860646860566e-07, "loss": 0.0, "num_tokens": 9332206.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 438.875, "completions/mean_terminated_length": 438.875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7060082965341897, "frac_reward_zero_std": 1.0, "grad_norm": 0.01701007643862124, "kl": 0.001621246337890625, "learning_rate": 3.173986438544443e-07, "loss": 0.0001, "num_tokens": 9336985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 260.625, "completions/mean_terminated_length": 260.625, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7062759266693429, "frac_reward_zero_std": 1.0, "grad_norm": 0.025087380805353283, "kl": 0.00162506103515625, "learning_rate": 3.170388842226079e-07, "loss": 0.0001, "num_tokens": 9340038.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 248.75, "completions/mean_terminated_length": 248.75, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7065435568044962, "frac_reward_zero_std": 1.0, "grad_norm": 0.01258943279779057, "kl": 0.00128936767578125, "learning_rate": 3.166793278870441e-07, "loss": 0.0001, "num_tokens": 9343072.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 348.125, "completions/mean_terminated_length": 348.125, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7068111869396494, "frac_reward_zero_std": 1.0, "grad_norm": 0.012859447122011588, "kl": 0.0012054443359375, "learning_rate": 3.1631997516152456e-07, "loss": 0.0, "num_tokens": 9346881.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7070788170748026, "frac_reward_zero_std": 1.0, "grad_norm": 0.01719651231816304, "kl": 0.0015411376953125, "learning_rate": 3.1596082635964274e-07, "loss": 0.0001, "num_tokens": 9350383.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 499.0, "completions/max_terminated_length": 499.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7073464472099559, "frac_reward_zero_std": 1.0, "grad_norm": 0.013859182772289248, "kl": 0.001369476318359375, "learning_rate": 3.156018817948135e-07, "loss": 0.0001, "num_tokens": 9353804.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 376.875, "completions/mean_terminated_length": 376.875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7076140773451091, "frac_reward_zero_std": 0.5, "grad_norm": 0.7285214196978572, "kl": 0.0032501220703125, "learning_rate": 3.1524314178027486e-07, "loss": 0.0606, "num_tokens": 9357911.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7078817074802622, "frac_reward_zero_std": 0.0, "grad_norm": 0.9993115962112142, "kl": 0.001842498779296875, "learning_rate": 3.148846066290847e-07, "loss": -0.0201, "num_tokens": 9360942.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 441.0, "completions/max_terminated_length": 441.0, "completions/mean_length": 324.25, "completions/mean_terminated_length": 324.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7081493376154155, "frac_reward_zero_std": 1.0, "grad_norm": 0.020440303537852915, "kl": 0.0014190673828125, "learning_rate": 3.145262766541238e-07, "loss": 0.0001, "num_tokens": 9364804.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 246.25, "completions/mean_terminated_length": 246.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7084169677505687, "frac_reward_zero_std": 1.0, "grad_norm": 0.013242851084258286, "kl": 0.000728607177734375, "learning_rate": 3.1416815216809235e-07, "loss": 0.0, "num_tokens": 9367690.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 222.875, "completions/mean_terminated_length": 222.875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7086845978857219, "frac_reward_zero_std": 1.0, "grad_norm": 0.01966652172586972, "kl": 0.00152587890625, "learning_rate": 3.1381023348351227e-07, "loss": 0.0001, "num_tokens": 9370445.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7089522280208751, "frac_reward_zero_std": 0.5, "grad_norm": 1.2270960529513664, "kl": 0.001972198486328125, "learning_rate": 3.1345252091272546e-07, "loss": -0.0386, "num_tokens": 9373775.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 405.5, "completions/mean_terminated_length": 405.5, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.7092198581560284, "frac_reward_zero_std": 1.0, "grad_norm": 0.026410956711189124, "kl": 0.00237274169921875, "learning_rate": 3.130950147678938e-07, "loss": 0.0001, "num_tokens": 9378159.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.7094874882911816, "frac_reward_zero_std": 1.0, "grad_norm": 0.03281335158997296, "kl": 0.00214385986328125, "learning_rate": 3.127377153609998e-07, "loss": 0.0001, "num_tokens": 9381497.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.7097551184263348, "frac_reward_zero_std": 1.0, "grad_norm": 0.014001230768722842, "kl": 0.00138092041015625, "learning_rate": 3.123806230038446e-07, "loss": 0.0001, "num_tokens": 9385498.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 354.375, "completions/mean_terminated_length": 354.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7100227485614881, "frac_reward_zero_std": 1.0, "grad_norm": 0.016476812426001672, "kl": 0.001056671142578125, "learning_rate": 3.1202373800804916e-07, "loss": 0.0, "num_tokens": 9389541.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7102903786966412, "frac_reward_zero_std": 1.0, "grad_norm": 0.021801789111641007, "kl": 0.001575469970703125, "learning_rate": 3.116670606850533e-07, "loss": 0.0001, "num_tokens": 9393414.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7105580088317944, "frac_reward_zero_std": 1.0, "grad_norm": 0.011289283134098856, "kl": 0.0007991790771484375, "learning_rate": 3.113105913461159e-07, "loss": 0.0, "num_tokens": 9396659.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 372.375, "completions/mean_terminated_length": 372.375, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7108256389669477, "frac_reward_zero_std": 1.0, "grad_norm": 0.011950753510994177, "kl": 0.001537322998046875, "learning_rate": 3.10954330302314e-07, "loss": 0.0001, "num_tokens": 9400910.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 388.0, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.7110932691021009, "frac_reward_zero_std": 1.0, "grad_norm": 0.014384350589430196, "kl": 0.0008716583251953125, "learning_rate": 3.1059827786454306e-07, "loss": 0.0, "num_tokens": 9405198.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 269.75, "completions/mean_terminated_length": 269.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7113608992372541, "frac_reward_zero_std": 1.0, "grad_norm": 0.014230861545612138, "kl": 0.001308441162109375, "learning_rate": 3.102424343435165e-07, "loss": 0.0001, "num_tokens": 9408336.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 198.375, "completions/mean_terminated_length": 198.375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7116285293724073, "frac_reward_zero_std": 1.0, "grad_norm": 0.013438231370882727, "kl": 0.0007696151733398438, "learning_rate": 3.098868000497653e-07, "loss": 0.0, "num_tokens": 9410831.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7118961595075606, "frac_reward_zero_std": 1.0, "grad_norm": 0.020850293787927493, "kl": 0.00127410888671875, "learning_rate": 3.0953137529363795e-07, "loss": 0.0001, "num_tokens": 9414323.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 259.0, "completions/mean_terminated_length": 259.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.7121637896427138, "frac_reward_zero_std": 1.0, "grad_norm": 0.022517525445559117, "kl": 0.0024871826171875, "learning_rate": 3.0917616038530005e-07, "loss": 0.0001, "num_tokens": 9417495.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.712431419777867, "frac_reward_zero_std": 1.0, "grad_norm": 0.01610961595912449, "kl": 0.001461029052734375, "learning_rate": 3.088211556347341e-07, "loss": 0.0001, "num_tokens": 9421305.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7126990499130202, "frac_reward_zero_std": 0.0, "grad_norm": 1.3587269877160704, "kl": 0.003631591796875, "learning_rate": 3.084663613517393e-07, "loss": -0.0626, "num_tokens": 9424010.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7129666800481734, "frac_reward_zero_std": 1.0, "grad_norm": 0.046069124835913196, "kl": 0.00264739990234375, "learning_rate": 3.081117778459309e-07, "loss": 0.0001, "num_tokens": 9428665.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 437.375, "completions/mean_terminated_length": 437.375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7132343101833266, "frac_reward_zero_std": 1.0, "grad_norm": 0.03505752048911303, "kl": 0.00098419189453125, "learning_rate": 3.0775740542674055e-07, "loss": 0.0, "num_tokens": 9433328.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 253.25, "completions/mean_terminated_length": 253.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.7135019403184799, "frac_reward_zero_std": 0.5, "grad_norm": 0.824422886485238, "kl": 0.0023956298828125, "learning_rate": 3.0740324440341556e-07, "loss": -0.0796, "num_tokens": 9436282.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 299.375, "completions/mean_terminated_length": 299.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7137695704536331, "frac_reward_zero_std": 1.0, "grad_norm": 0.014196620887745757, "kl": 0.0012969970703125, "learning_rate": 3.070492950850183e-07, "loss": 0.0001, "num_tokens": 9439641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 255.25, "completions/mean_terminated_length": 255.25, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7140372005887863, "frac_reward_zero_std": 1.0, "grad_norm": 0.03381322384597741, "kl": 0.002170562744140625, "learning_rate": 3.0669555778042756e-07, "loss": 0.0001, "num_tokens": 9442679.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 380.75, "completions/mean_terminated_length": 380.75, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7143048307239395, "frac_reward_zero_std": 1.0, "grad_norm": 0.01078399510989344, "kl": 0.001007080078125, "learning_rate": 3.063420327983357e-07, "loss": 0.0, "num_tokens": 9446821.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 293.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7145724608590928, "frac_reward_zero_std": 1.0, "grad_norm": 0.04530620416187726, "kl": 0.003204345703125, "learning_rate": 3.059887204472508e-07, "loss": 0.0001, "num_tokens": 9450222.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 299.875, "completions/mean_terminated_length": 299.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.714840090994246, "frac_reward_zero_std": 1.0, "grad_norm": 0.017598442715278276, "kl": 0.00145721435546875, "learning_rate": 3.056356210354952e-07, "loss": 0.0001, "num_tokens": 9453601.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 380.375, "completions/mean_terminated_length": 380.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7151077211293991, "frac_reward_zero_std": 0.5, "grad_norm": 0.5444722151278364, "kl": 0.001495361328125, "learning_rate": 3.0528273487120495e-07, "loss": -0.0358, "num_tokens": 9457796.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 376.0, "completions/mean_terminated_length": 376.0, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.7153753512645524, "frac_reward_zero_std": 1.0, "grad_norm": 0.011864757259135569, "kl": 0.0008029937744140625, "learning_rate": 3.0493006226233063e-07, "loss": 0.0, "num_tokens": 9461872.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7156429813997056, "frac_reward_zero_std": 1.0, "grad_norm": 0.013827210128642556, "kl": 0.001056671142578125, "learning_rate": 3.045776035166358e-07, "loss": 0.0, "num_tokens": 9465083.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 318.25, "completions/mean_terminated_length": 318.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.7159106115348588, "frac_reward_zero_std": 1.0, "grad_norm": 0.017006932147235886, "kl": 0.00162506103515625, "learning_rate": 3.0422535894169826e-07, "loss": 0.0001, "num_tokens": 9468689.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.7161782416700121, "frac_reward_zero_std": 1.0, "grad_norm": 0.012634760067068743, "kl": 0.0011806488037109375, "learning_rate": 3.0387332884490804e-07, "loss": 0.0, "num_tokens": 9472593.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 303.5, "completions/mean_terminated_length": 303.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7164458718051653, "frac_reward_zero_std": 1.0, "grad_norm": 0.013879397748936054, "kl": 0.0009975433349609375, "learning_rate": 3.0352151353346833e-07, "loss": 0.0, "num_tokens": 9476033.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.7167135019403185, "frac_reward_zero_std": 1.0, "grad_norm": 0.01578445778084049, "kl": 0.001495361328125, "learning_rate": 3.0316991331439506e-07, "loss": 0.0001, "num_tokens": 9479865.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 210.25, "completions/mean_terminated_length": 210.25, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7169811320754716, "frac_reward_zero_std": 1.0, "grad_norm": 0.021151097826974212, "kl": 0.0019989013671875, "learning_rate": 3.028185284945164e-07, "loss": 0.0001, "num_tokens": 9482583.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 359.25, "completions/mean_terminated_length": 359.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.717248762210625, "frac_reward_zero_std": 1.0, "grad_norm": 0.0170840325236837, "kl": 0.0010700225830078125, "learning_rate": 3.024673593804723e-07, "loss": 0.0, "num_tokens": 9486725.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.7175163923457781, "frac_reward_zero_std": 1.0, "grad_norm": 0.0619706763639474, "kl": 0.002685546875, "learning_rate": 3.021164062787147e-07, "loss": 0.0001, "num_tokens": 9490250.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7177840224809313, "frac_reward_zero_std": 1.0, "grad_norm": 0.01429805281352421, "kl": 0.001102447509765625, "learning_rate": 3.0176566949550696e-07, "loss": 0.0, "num_tokens": 9493420.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7180516526160846, "frac_reward_zero_std": 0.5, "grad_norm": 1.4593301345894267, "kl": 0.001499176025390625, "learning_rate": 3.014151493369238e-07, "loss": 0.0147, "num_tokens": 9496510.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7183192827512378, "frac_reward_zero_std": 1.0, "grad_norm": 0.017282684965618193, "kl": 0.001560211181640625, "learning_rate": 3.0106484610885064e-07, "loss": 0.0001, "num_tokens": 9499338.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.718586912886391, "frac_reward_zero_std": 1.0, "grad_norm": 0.025280587011280865, "kl": 0.002227783203125, "learning_rate": 3.0071476011698383e-07, "loss": 0.0001, "num_tokens": 9502238.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7188545430215443, "frac_reward_zero_std": 1.0, "grad_norm": 0.02033716218803202, "kl": 0.002105712890625, "learning_rate": 3.0036489166682995e-07, "loss": 0.0001, "num_tokens": 9505528.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 263.75, "completions/mean_terminated_length": 263.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7191221731566975, "frac_reward_zero_std": 1.0, "grad_norm": 0.020385407138024485, "kl": 0.001621246337890625, "learning_rate": 3.000152410637059e-07, "loss": 0.0001, "num_tokens": 9508702.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 300.5, "completions/mean_terminated_length": 300.5, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7193898032918506, "frac_reward_zero_std": 1.0, "grad_norm": 0.023376165868109998, "kl": 0.0012264251708984375, "learning_rate": 2.9966580861273847e-07, "loss": 0.0, "num_tokens": 9512142.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 263.5, "completions/mean_terminated_length": 263.5, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.7196574334270038, "frac_reward_zero_std": 1.0, "grad_norm": 0.018950809588877574, "kl": 0.00154876708984375, "learning_rate": 2.993165946188639e-07, "loss": 0.0001, "num_tokens": 9515374.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 328.75, "completions/mean_terminated_length": 328.75, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.7199250635621571, "frac_reward_zero_std": 1.0, "grad_norm": 0.04111693514535339, "kl": 0.00249481201171875, "learning_rate": 2.98967599386828e-07, "loss": 0.0001, "num_tokens": 9519092.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 318.75, "completions/mean_terminated_length": 318.75, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.7201926936973103, "frac_reward_zero_std": 1.0, "grad_norm": 0.02107768705642283, "kl": 0.001041412353515625, "learning_rate": 2.986188232211856e-07, "loss": 0.0, "num_tokens": 9522754.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7204603238324635, "frac_reward_zero_std": 1.0, "grad_norm": 0.016243319296829664, "kl": 0.00153350830078125, "learning_rate": 2.9827026642630024e-07, "loss": 0.0001, "num_tokens": 9526777.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 497.0, "completions/max_terminated_length": 497.0, "completions/mean_length": 281.875, "completions/mean_terminated_length": 281.875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7207279539676168, "frac_reward_zero_std": 1.0, "grad_norm": 0.02032566160723728, "kl": 0.00168609619140625, "learning_rate": 2.9792192930634426e-07, "loss": 0.0001, "num_tokens": 9530028.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 364.875, "completions/mean_terminated_length": 364.875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.72099558410277, "frac_reward_zero_std": 1.0, "grad_norm": 0.011972232442994704, "kl": 0.00136566162109375, "learning_rate": 2.975738121652981e-07, "loss": 0.0001, "num_tokens": 9534107.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 229.625, "completions/mean_terminated_length": 229.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7212632142379232, "frac_reward_zero_std": 1.0, "grad_norm": 0.012186050129596109, "kl": 0.000957489013671875, "learning_rate": 2.972259153069504e-07, "loss": 0.0, "num_tokens": 9536840.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 268.0, "completions/mean_terminated_length": 268.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.7215308443730765, "frac_reward_zero_std": 1.0, "grad_norm": 0.013689637333059477, "kl": 0.001125335693359375, "learning_rate": 2.968782390348972e-07, "loss": 0.0, "num_tokens": 9540000.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.7217984745082296, "frac_reward_zero_std": 1.0, "grad_norm": 0.010612561667379162, "kl": 0.0008907318115234375, "learning_rate": 2.9653078365254265e-07, "loss": 0.0, "num_tokens": 9543707.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7220661046433828, "frac_reward_zero_std": 1.0, "grad_norm": 0.013245266894310633, "kl": 0.001251220703125, "learning_rate": 2.961835494630973e-07, "loss": 0.0001, "num_tokens": 9547271.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.722333734778536, "frac_reward_zero_std": 1.0, "grad_norm": 0.01223237791061234, "kl": 0.00128173828125, "learning_rate": 2.958365367695798e-07, "loss": 0.0001, "num_tokens": 9550903.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 395.5, "completions/mean_terminated_length": 395.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.7226013649136893, "frac_reward_zero_std": 0.5, "grad_norm": 0.5555484705433985, "kl": 0.00119781494140625, "learning_rate": 2.9548974587481466e-07, "loss": 0.0495, "num_tokens": 9555219.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 428.75, "completions/mean_terminated_length": 428.75, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.7228689950488425, "frac_reward_zero_std": 1.0, "grad_norm": 0.012167012446311078, "kl": 0.00121307373046875, "learning_rate": 2.951431770814327e-07, "loss": 0.0, "num_tokens": 9559641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7231366251839957, "frac_reward_zero_std": 1.0, "grad_norm": 0.018667964357969098, "kl": 0.001422882080078125, "learning_rate": 2.9479683069187187e-07, "loss": 0.0001, "num_tokens": 9563006.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 467.0, "completions/mean_terminated_length": 467.0, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.723404255319149, "frac_reward_zero_std": 0.5, "grad_norm": 0.8820647065596365, "kl": 0.001953125, "learning_rate": 2.9445070700837484e-07, "loss": 0.0753, "num_tokens": 9568190.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7236718854543022, "frac_reward_zero_std": 1.0, "grad_norm": 0.02866531695358928, "kl": 0.002429962158203125, "learning_rate": 2.941048063329912e-07, "loss": 0.0001, "num_tokens": 9571813.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 251.0, "completions/max_terminated_length": 251.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7239395155894554, "frac_reward_zero_std": 1.0, "grad_norm": 0.022931313471927343, "kl": 0.00152587890625, "learning_rate": 2.937591289675746e-07, "loss": 0.0001, "num_tokens": 9574872.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 455.0, "completions/max_terminated_length": 455.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7242071457246085, "frac_reward_zero_std": 0.5, "grad_norm": 0.724550759226772, "kl": 0.00217437744140625, "learning_rate": 2.934136752137849e-07, "loss": -0.0049, "num_tokens": 9578293.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 391.75, "completions/mean_terminated_length": 391.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7244747758597618, "frac_reward_zero_std": 1.0, "grad_norm": 0.009716200154859519, "kl": 0.000873565673828125, "learning_rate": 2.930684453730862e-07, "loss": 0.0, "num_tokens": 9582439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 279.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.724742405994915, "frac_reward_zero_std": 1.0, "grad_norm": 0.02512406156005209, "kl": 0.001407623291015625, "learning_rate": 2.927234397467475e-07, "loss": 0.0001, "num_tokens": 9585576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 316.875, "completions/mean_terminated_length": 316.875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7250100361300682, "frac_reward_zero_std": 1.0, "grad_norm": 0.017815495524141113, "kl": 0.001468658447265625, "learning_rate": 2.92378658635842e-07, "loss": 0.0001, "num_tokens": 9589127.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 341.5, "completions/mean_terminated_length": 341.5, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7252776662652215, "frac_reward_zero_std": 1.0, "grad_norm": 0.015380984734769348, "kl": 0.001613616943359375, "learning_rate": 2.9203410234124713e-07, "loss": 0.0001, "num_tokens": 9592855.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 233.75, "completions/mean_terminated_length": 233.75, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.7255452964003747, "frac_reward_zero_std": 1.0, "grad_norm": 0.01663383474751637, "kl": 0.00141143798828125, "learning_rate": 2.9168977116364386e-07, "loss": 0.0001, "num_tokens": 9595885.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7258129265355279, "frac_reward_zero_std": 1.0, "grad_norm": 0.013629302250193842, "kl": 0.001201629638671875, "learning_rate": 2.913456654035169e-07, "loss": 0.0, "num_tokens": 9599370.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 373.875, "completions/mean_terminated_length": 373.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7260805566706812, "frac_reward_zero_std": 1.0, "grad_norm": 0.019767263413406864, "kl": 0.0010204315185546875, "learning_rate": 2.910017853611544e-07, "loss": 0.0, "num_tokens": 9603413.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7263481868058344, "frac_reward_zero_std": 1.0, "grad_norm": 0.029493370191152293, "kl": 0.0015716552734375, "learning_rate": 2.90658131336647e-07, "loss": 0.0001, "num_tokens": 9607415.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.7266158169409875, "frac_reward_zero_std": 1.0, "grad_norm": 0.012466371921238105, "kl": 0.0010318756103515625, "learning_rate": 2.9031470362988877e-07, "loss": 0.0, "num_tokens": 9610576.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 385.625, "completions/mean_terminated_length": 385.625, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7268834470761407, "frac_reward_zero_std": 0.5, "grad_norm": 0.8714895089216101, "kl": 0.0013885498046875, "learning_rate": 2.8997150254057573e-07, "loss": 0.05, "num_tokens": 9614773.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 267.0, "completions/mean_terminated_length": 267.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.727151077211294, "frac_reward_zero_std": 1.0, "grad_norm": 0.02165443208608621, "kl": 0.001804351806640625, "learning_rate": 2.896285283682065e-07, "loss": 0.0001, "num_tokens": 9618121.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7274187073464472, "frac_reward_zero_std": 0.5, "grad_norm": 0.4790655718222041, "kl": 0.000858306884765625, "learning_rate": 2.892857814120815e-07, "loss": 0.0586, "num_tokens": 9621731.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7276863374816004, "frac_reward_zero_std": 1.0, "grad_norm": 0.012165203175061957, "kl": 0.001094818115234375, "learning_rate": 2.889432619713029e-07, "loss": 0.0, "num_tokens": 9624734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 393.75, "completions/mean_terminated_length": 393.75, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7279539676167537, "frac_reward_zero_std": 0.5, "grad_norm": 0.6321724822627943, "kl": 0.0036163330078125, "learning_rate": 2.886009703447745e-07, "loss": 0.0064, "num_tokens": 9629100.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 292.25, "completions/mean_terminated_length": 292.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7282215977519069, "frac_reward_zero_std": 1.0, "grad_norm": 0.023970357459335827, "kl": 0.0014495849609375, "learning_rate": 2.882589068312009e-07, "loss": 0.0001, "num_tokens": 9632494.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.72848922788706, "frac_reward_zero_std": 1.0, "grad_norm": 0.016580878966321575, "kl": 0.001331329345703125, "learning_rate": 2.8791707172908797e-07, "loss": 0.0001, "num_tokens": 9635493.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7287568580222133, "frac_reward_zero_std": 1.0, "grad_norm": 0.016406295435987994, "kl": 0.0016632080078125, "learning_rate": 2.875754653367422e-07, "loss": 0.0001, "num_tokens": 9638668.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 371.25, "completions/mean_terminated_length": 371.25, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7290244881573665, "frac_reward_zero_std": 0.5, "grad_norm": 0.8657480306298224, "kl": 0.002010345458984375, "learning_rate": 2.872340879522706e-07, "loss": 0.0915, "num_tokens": 9642678.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.7292921182925197, "frac_reward_zero_std": 1.0, "grad_norm": 0.013861341189334409, "kl": 0.0007457733154296875, "learning_rate": 2.8689293987357967e-07, "loss": 0.0, "num_tokens": 9646243.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 370.5, "completions/mean_terminated_length": 370.5, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7295597484276729, "frac_reward_zero_std": 1.0, "grad_norm": 0.02046049076052986, "kl": 0.001495361328125, "learning_rate": 2.8655202139837697e-07, "loss": 0.0001, "num_tokens": 9650619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 249.875, "completions/mean_terminated_length": 249.875, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7298273785628262, "frac_reward_zero_std": 1.0, "grad_norm": 0.04424226239221894, "kl": 0.00232696533203125, "learning_rate": 2.8621133282416833e-07, "loss": 0.0001, "num_tokens": 9653586.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 365.75, "completions/mean_terminated_length": 365.75, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7300950086979794, "frac_reward_zero_std": 1.0, "grad_norm": 0.01640448241748055, "kl": 0.0016326904296875, "learning_rate": 2.8587087444826037e-07, "loss": 0.0001, "num_tokens": 9657652.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 456.875, "completions/mean_terminated_length": 456.875, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.7303626388331326, "frac_reward_zero_std": 1.0, "grad_norm": 0.011516704753881703, "kl": 0.0010242462158203125, "learning_rate": 2.8553064656775767e-07, "loss": 0.0, "num_tokens": 9662491.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 222.625, "completions/mean_terminated_length": 222.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7306302689682859, "frac_reward_zero_std": 1.0, "grad_norm": 0.016098918864395538, "kl": 0.00096893310546875, "learning_rate": 2.85190649479564e-07, "loss": 0.0, "num_tokens": 9665312.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.730897899103439, "frac_reward_zero_std": 1.0, "grad_norm": 0.0243578468428686, "kl": 0.001682281494140625, "learning_rate": 2.8485088348038244e-07, "loss": 0.0001, "num_tokens": 9669136.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 320.0, "completions/mean_terminated_length": 320.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7311655292385922, "frac_reward_zero_std": 1.0, "grad_norm": 0.016842060049361764, "kl": 0.001117706298828125, "learning_rate": 2.8451134886671314e-07, "loss": 0.0, "num_tokens": 9672668.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 432.5, "completions/mean_terminated_length": 432.5, "completions/min_length": 335.0, "completions/min_terminated_length": 335.0, "epoch": 0.7314331593737455, "frac_reward_zero_std": 0.5, "grad_norm": 0.6373851464171828, "kl": 0.001522064208984375, "learning_rate": 2.8417204593485564e-07, "loss": 0.0001, "num_tokens": 9677464.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 531.0, "completions/max_terminated_length": 531.0, "completions/mean_length": 310.375, "completions/mean_terminated_length": 310.375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7317007895088987, "frac_reward_zero_std": 1.0, "grad_norm": 0.011951339173902457, "kl": 0.0006732940673828125, "learning_rate": 2.8383297498090614e-07, "loss": 0.0, "num_tokens": 9680927.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 327.25, "completions/mean_terminated_length": 327.25, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.7319684196440519, "frac_reward_zero_std": 0.5, "grad_norm": 0.9253989734551052, "kl": 0.0018463134765625, "learning_rate": 2.8349413630075903e-07, "loss": 0.0725, "num_tokens": 9684625.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.7322360497792051, "frac_reward_zero_std": 1.0, "grad_norm": 0.013266649178057319, "kl": 0.000823974609375, "learning_rate": 2.8315553019010605e-07, "loss": 0.0, "num_tokens": 9688093.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7325036799143584, "frac_reward_zero_std": 0.5, "grad_norm": 0.9302781213147919, "kl": 0.00188446044921875, "learning_rate": 2.8281715694443564e-07, "loss": 0.0267, "num_tokens": 9691898.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7327713100495116, "frac_reward_zero_std": 0.5, "grad_norm": 0.8203177904778524, "kl": 0.0005207061767578125, "learning_rate": 2.824790168590334e-07, "loss": 0.0027, "num_tokens": 9695317.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7330389401846648, "frac_reward_zero_std": 1.0, "grad_norm": 0.017995645069363974, "kl": 0.001087188720703125, "learning_rate": 2.82141110228981e-07, "loss": 0.0, "num_tokens": 9697957.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 252.5, "completions/mean_terminated_length": 252.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.733306570319818, "frac_reward_zero_std": 1.0, "grad_norm": 0.02000328540189195, "kl": 0.001163482666015625, "learning_rate": 2.8180343734915717e-07, "loss": 0.0, "num_tokens": 9700985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 228.0, "completions/mean_terminated_length": 228.0, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7335742004549712, "frac_reward_zero_std": 1.0, "grad_norm": 0.03802463775242428, "kl": 0.0022430419921875, "learning_rate": 2.814659985142356e-07, "loss": 0.0001, "num_tokens": 9703785.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.7338418305901244, "frac_reward_zero_std": 1.0, "grad_norm": 0.016998581517883735, "kl": 0.001407623291015625, "learning_rate": 2.8112879401868655e-07, "loss": 0.0001, "num_tokens": 9706999.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7341094607252777, "frac_reward_zero_std": 0.5, "grad_norm": 0.6826825904676936, "kl": 0.0014190673828125, "learning_rate": 2.807918241567754e-07, "loss": -0.0261, "num_tokens": 9710616.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 280.0, "completions/mean_terminated_length": 280.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.7343770908604309, "frac_reward_zero_std": 1.0, "grad_norm": 0.010696731693598934, "kl": 0.0007762908935546875, "learning_rate": 2.80455089222563e-07, "loss": 0.0, "num_tokens": 9713884.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 483.0, "completions/max_terminated_length": 483.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7346447209955841, "frac_reward_zero_std": 1.0, "grad_norm": 0.012005578619943696, "kl": 0.000919342041015625, "learning_rate": 2.8011858950990495e-07, "loss": 0.0, "num_tokens": 9717828.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 330.625, "completions/mean_terminated_length": 330.625, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.7349123511307373, "frac_reward_zero_std": 1.0, "grad_norm": 0.012018835099077951, "kl": 0.0010528564453125, "learning_rate": 2.797823253124518e-07, "loss": 0.0, "num_tokens": 9721433.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.7351799812658906, "frac_reward_zero_std": 1.0, "grad_norm": 0.02277479755018531, "kl": 0.001758575439453125, "learning_rate": 2.794462969236485e-07, "loss": 0.0001, "num_tokens": 9725254.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 218.75, "completions/mean_terminated_length": 218.75, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.7354476114010438, "frac_reward_zero_std": 1.0, "grad_norm": 0.020619653592285004, "kl": 0.00244140625, "learning_rate": 2.791105046367341e-07, "loss": 0.0001, "num_tokens": 9727872.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 219.25, "completions/mean_terminated_length": 219.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7357152415361969, "frac_reward_zero_std": 1.0, "grad_norm": 0.013978252920396375, "kl": 0.0013275146484375, "learning_rate": 2.787749487447417e-07, "loss": 0.0001, "num_tokens": 9730634.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 278.5, "completions/mean_terminated_length": 278.5, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.7359828716713502, "frac_reward_zero_std": 1.0, "grad_norm": 0.012283061342446456, "kl": 0.0008411407470703125, "learning_rate": 2.7843962954049826e-07, "loss": 0.0, "num_tokens": 9733922.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 403.875, "completions/mean_terminated_length": 403.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7362505018065034, "frac_reward_zero_std": 0.5, "grad_norm": 9.325964675257993, "kl": 0.12653350830078125, "learning_rate": 2.781045473166239e-07, "loss": 0.0426, "num_tokens": 9738317.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7365181319416566, "frac_reward_zero_std": 1.0, "grad_norm": 0.012570227571520105, "kl": 0.001125335693359375, "learning_rate": 2.7776970236553215e-07, "loss": 0.0, "num_tokens": 9741923.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 391.375, "completions/mean_terminated_length": 391.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7367857620768099, "frac_reward_zero_std": 0.5, "grad_norm": 0.8091135030044168, "kl": 0.00275421142578125, "learning_rate": 2.774350949794296e-07, "loss": 0.0042, "num_tokens": 9746094.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 371.875, "completions/mean_terminated_length": 371.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.7370533922119631, "frac_reward_zero_std": 0.5, "grad_norm": 0.7616592757678214, "kl": 0.0019989013671875, "learning_rate": 2.771007254503149e-07, "loss": 0.0253, "num_tokens": 9750385.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 255.75, "completions/mean_terminated_length": 255.75, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.7373210223471163, "frac_reward_zero_std": 1.0, "grad_norm": 0.0395961053666954, "kl": 0.00384521484375, "learning_rate": 2.7676659406998005e-07, "loss": 0.0002, "num_tokens": 9753579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.7375886524822695, "frac_reward_zero_std": 1.0, "grad_norm": 0.016642994118633507, "kl": 0.001171112060546875, "learning_rate": 2.764327011300083e-07, "loss": 0.0, "num_tokens": 9757478.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7378562826174228, "frac_reward_zero_std": 1.0, "grad_norm": 0.014767408401978726, "kl": 0.0009136199951171875, "learning_rate": 2.760990469217758e-07, "loss": 0.0, "num_tokens": 9761050.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 337.75, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7381239127525759, "frac_reward_zero_std": 0.5, "grad_norm": 0.6320836485258572, "kl": 0.001316070556640625, "learning_rate": 2.757656317364493e-07, "loss": -0.01, "num_tokens": 9764940.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7383915428877291, "frac_reward_zero_std": 1.0, "grad_norm": 0.012938852437275545, "kl": 0.00125885009765625, "learning_rate": 2.7543245586498765e-07, "loss": 0.0001, "num_tokens": 9769000.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 259.125, "completions/mean_terminated_length": 259.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7386591730228824, "frac_reward_zero_std": 1.0, "grad_norm": 0.01591086671468018, "kl": 0.0010833740234375, "learning_rate": 2.7509951959814113e-07, "loss": 0.0, "num_tokens": 9772021.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7389268031580356, "frac_reward_zero_std": 1.0, "grad_norm": 0.015518494119257533, "kl": 0.0008945465087890625, "learning_rate": 2.7476682322644995e-07, "loss": 0.0, "num_tokens": 9775492.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 319.375, "completions/mean_terminated_length": 319.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.7391944332931888, "frac_reward_zero_std": 0.5, "grad_norm": 0.5604727378534453, "kl": 0.0010585784912109375, "learning_rate": 2.7443436704024604e-07, "loss": 0.0284, "num_tokens": 9779151.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 240.5, "completions/mean_terminated_length": 240.5, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.7394620634283421, "frac_reward_zero_std": 1.0, "grad_norm": 0.01670190960623932, "kl": 0.001117706298828125, "learning_rate": 2.741021513296507e-07, "loss": 0.0, "num_tokens": 9782047.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 350.25, "completions/mean_terminated_length": 350.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.7397296935634953, "frac_reward_zero_std": 1.0, "grad_norm": 0.016761737137820143, "kl": 0.0010356903076171875, "learning_rate": 2.7377017638457646e-07, "loss": 0.0, "num_tokens": 9785889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 257.0, "completions/mean_terminated_length": 257.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7399973236986485, "frac_reward_zero_std": 1.0, "grad_norm": 0.018794131023582503, "kl": 0.001232147216796875, "learning_rate": 2.7343844249472463e-07, "loss": 0.0, "num_tokens": 9789005.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 240.75, "completions/mean_terminated_length": 240.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7402649538338016, "frac_reward_zero_std": 1.0, "grad_norm": 0.028757688237716232, "kl": 0.00196075439453125, "learning_rate": 2.7310694994958713e-07, "loss": 0.0001, "num_tokens": 9791783.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 351.125, "completions/mean_terminated_length": 351.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7405325839689549, "frac_reward_zero_std": 0.5, "grad_norm": 1.1277296158939039, "kl": 0.00223541259765625, "learning_rate": 2.7277569903844455e-07, "loss": 0.021, "num_tokens": 9795648.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7408002141041081, "frac_reward_zero_std": 1.0, "grad_norm": 0.02812005364853488, "kl": 0.001766204833984375, "learning_rate": 2.7244469005036695e-07, "loss": 0.0001, "num_tokens": 9799327.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.7410678442392613, "frac_reward_zero_std": 1.0, "grad_norm": 0.011796608617274194, "kl": 0.0009555816650390625, "learning_rate": 2.721139232742137e-07, "loss": 0.0, "num_tokens": 9803557.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 337.125, "completions/mean_terminated_length": 337.125, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.7413354743744146, "frac_reward_zero_std": 1.0, "grad_norm": 0.018941328477150902, "kl": 0.001964569091796875, "learning_rate": 2.717833989986318e-07, "loss": 0.0001, "num_tokens": 9807270.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 278.0, "completions/mean_terminated_length": 278.0, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.7416031045095678, "frac_reward_zero_std": 1.0, "grad_norm": 0.012303660290876083, "kl": 0.000919342041015625, "learning_rate": 2.714531175120573e-07, "loss": 0.0, "num_tokens": 9810646.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 207.5, "completions/mean_terminated_length": 207.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.741870734644721, "frac_reward_zero_std": 1.0, "grad_norm": 0.024550538228148833, "kl": 0.00135040283203125, "learning_rate": 2.7112307910271435e-07, "loss": 0.0001, "num_tokens": 9813386.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.7421383647798742, "frac_reward_zero_std": 1.0, "grad_norm": 0.01790858034177775, "kl": 0.00173187255859375, "learning_rate": 2.707932840586148e-07, "loss": 0.0001, "num_tokens": 9816640.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 216.5, "completions/mean_terminated_length": 216.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.7424059949150275, "frac_reward_zero_std": 1.0, "grad_norm": 0.033541157724320585, "kl": 0.001583099365234375, "learning_rate": 2.704637326675583e-07, "loss": 0.0001, "num_tokens": 9819364.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 276.75, "completions/mean_terminated_length": 276.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.7426736250501806, "frac_reward_zero_std": 1.0, "grad_norm": 0.02573809392892229, "kl": 0.00254058837890625, "learning_rate": 2.7013442521713156e-07, "loss": 0.0001, "num_tokens": 9822518.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 321.0, "completions/mean_terminated_length": 321.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7429412551853338, "frac_reward_zero_std": 1.0, "grad_norm": 0.020496705580757748, "kl": 0.001697540283203125, "learning_rate": 2.6980536199470876e-07, "loss": 0.0001, "num_tokens": 9826546.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 871.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 398.125, "completions/mean_terminated_length": 398.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7432088853204871, "frac_reward_zero_std": 0.5, "grad_norm": 0.8437561502841869, "kl": 0.001964569091796875, "learning_rate": 2.6947654328745096e-07, "loss": 0.0001, "num_tokens": 9830667.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 269.0, "completions/max_terminated_length": 269.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7434765154556403, "frac_reward_zero_std": 1.0, "grad_norm": 0.024259533082342796, "kl": 0.00254058837890625, "learning_rate": 2.691479693823053e-07, "loss": 0.0001, "num_tokens": 9833493.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 263.125, "completions/mean_terminated_length": 263.125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7437441455907935, "frac_reward_zero_std": 1.0, "grad_norm": 0.014791070434734986, "kl": 0.001491546630859375, "learning_rate": 2.6881964056600596e-07, "loss": 0.0001, "num_tokens": 9836538.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 379.75, "completions/mean_terminated_length": 379.75, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.7440117757259468, "frac_reward_zero_std": 1.0, "grad_norm": 0.013200158341123947, "kl": 0.00152587890625, "learning_rate": 2.6849155712507297e-07, "loss": 0.0001, "num_tokens": 9840716.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 310.0, "completions/max_terminated_length": 310.0, "completions/mean_length": 290.75, "completions/mean_terminated_length": 290.75, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.7442794058611, "frac_reward_zero_std": 0.5, "grad_norm": 1.4599920750512632, "kl": 0.00310516357421875, "learning_rate": 2.6816371934581224e-07, "loss": -0.0043, "num_tokens": 9844102.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7445470359962532, "frac_reward_zero_std": 1.0, "grad_norm": 0.019708678152878967, "kl": 0.00203704833984375, "learning_rate": 2.6783612751431526e-07, "loss": 0.0001, "num_tokens": 9847990.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 357.5, "completions/mean_terminated_length": 357.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.7448146661314063, "frac_reward_zero_std": 1.0, "grad_norm": 0.015833214134518468, "kl": 0.0013885498046875, "learning_rate": 2.6750878191645864e-07, "loss": 0.0001, "num_tokens": 9852022.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 454.875, "completions/mean_terminated_length": 454.875, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 0.7450822962665596, "frac_reward_zero_std": 1.0, "grad_norm": 0.13076879500732433, "kl": 0.00119781494140625, "learning_rate": 2.67181682837905e-07, "loss": 0.0, "num_tokens": 9856685.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 359.125, "completions/mean_terminated_length": 359.125, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7453499264017128, "frac_reward_zero_std": 1.0, "grad_norm": 0.034209967869130976, "kl": 0.0013885498046875, "learning_rate": 2.668548305641004e-07, "loss": 0.0001, "num_tokens": 9860646.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 264.0, "completions/max_terminated_length": 264.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.745617556536866, "frac_reward_zero_std": 1.0, "grad_norm": 0.044952008778352495, "kl": 0.001941680908203125, "learning_rate": 2.665282253802772e-07, "loss": 0.0001, "num_tokens": 9863223.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7458851866720193, "frac_reward_zero_std": 1.0, "grad_norm": 0.029316514753214836, "kl": 0.00262451171875, "learning_rate": 2.662018675714505e-07, "loss": 0.0001, "num_tokens": 9866597.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 280.875, "completions/mean_terminated_length": 280.875, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7461528168071725, "frac_reward_zero_std": 1.0, "grad_norm": 0.011841806242656032, "kl": 0.0009765625, "learning_rate": 2.6587575742242085e-07, "loss": 0.0, "num_tokens": 9869736.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 406.625, "completions/mean_terminated_length": 406.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7464204469423257, "frac_reward_zero_std": 1.0, "grad_norm": 0.009270869055505788, "kl": 0.001026153564453125, "learning_rate": 2.6554989521777205e-07, "loss": 0.0, "num_tokens": 9874061.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.746688077077479, "frac_reward_zero_std": 1.0, "grad_norm": 0.010581245282580403, "kl": 0.00069427490234375, "learning_rate": 2.652242812418712e-07, "loss": 0.0, "num_tokens": 9877285.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 431.0, "completions/mean_terminated_length": 431.0, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7469557072126322, "frac_reward_zero_std": 1.0, "grad_norm": 0.016325000568972202, "kl": 0.0008945465087890625, "learning_rate": 2.648989157788699e-07, "loss": 0.0, "num_tokens": 9881681.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 351.0, "completions/max_terminated_length": 351.0, "completions/mean_length": 262.0, "completions/mean_terminated_length": 262.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7472233373477853, "frac_reward_zero_std": 1.0, "grad_norm": 0.019593385723485582, "kl": 0.00112152099609375, "learning_rate": 2.6457379911270134e-07, "loss": 0.0, "num_tokens": 9884865.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 298.5, "completions/mean_terminated_length": 298.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7474909674829385, "frac_reward_zero_std": 1.0, "grad_norm": 0.018888475027894064, "kl": 0.001377105712890625, "learning_rate": 2.642489315270832e-07, "loss": 0.0001, "num_tokens": 9888289.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7477585976180918, "frac_reward_zero_std": 1.0, "grad_norm": 0.04704246660842775, "kl": 0.00301361083984375, "learning_rate": 2.6392431330551443e-07, "loss": 0.0001, "num_tokens": 9891507.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 596.0, "completions/max_terminated_length": 596.0, "completions/mean_length": 321.75, "completions/mean_terminated_length": 321.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.748026227753245, "frac_reward_zero_std": 1.0, "grad_norm": 0.021268792266755945, "kl": 0.0015869140625, "learning_rate": 2.635999447312773e-07, "loss": 0.0001, "num_tokens": 9895017.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.7482938578883982, "frac_reward_zero_std": 1.0, "grad_norm": 0.028903821244455855, "kl": 0.00203704833984375, "learning_rate": 2.632758260874358e-07, "loss": 0.0001, "num_tokens": 9898462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 259.625, "completions/mean_terminated_length": 259.625, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7485614880235515, "frac_reward_zero_std": 1.0, "grad_norm": 0.018005638285798972, "kl": 0.00174713134765625, "learning_rate": 2.6295195765683563e-07, "loss": 0.0001, "num_tokens": 9901659.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 270.25, "completions/mean_terminated_length": 270.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7488291181587047, "frac_reward_zero_std": 1.0, "grad_norm": 0.01800897353570929, "kl": 0.00125885009765625, "learning_rate": 2.62628339722105e-07, "loss": 0.0001, "num_tokens": 9904873.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 530.0, "completions/max_terminated_length": 530.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.7490967482938579, "frac_reward_zero_std": 0.5, "grad_norm": 0.7947529270934204, "kl": 0.00298309326171875, "learning_rate": 2.623049725656523e-07, "loss": 0.007, "num_tokens": 9909062.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 310.5, "completions/mean_terminated_length": 310.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7493643784290112, "frac_reward_zero_std": 1.0, "grad_norm": 0.022448232424380148, "kl": 0.0018463134765625, "learning_rate": 2.619818564696682e-07, "loss": 0.0001, "num_tokens": 9912742.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7496320085641643, "frac_reward_zero_std": 0.5, "grad_norm": 1.0397856125710494, "kl": 0.00258636474609375, "learning_rate": 2.6165899171612344e-07, "loss": 0.018, "num_tokens": 9916497.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 442.375, "completions/mean_terminated_length": 442.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.7498996386993175, "frac_reward_zero_std": 0.5, "grad_norm": 0.5147059943236658, "kl": 0.000995635986328125, "learning_rate": 2.6133637858676987e-07, "loss": 0.0, "num_tokens": 9921240.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 341.75, "completions/mean_terminated_length": 341.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.7501672688344707, "frac_reward_zero_std": 1.0, "grad_norm": 0.013336380783575996, "kl": 0.001026153564453125, "learning_rate": 2.6101401736313975e-07, "loss": 0.0, "num_tokens": 9925094.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 302.875, "completions/mean_terminated_length": 302.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.750434898969624, "frac_reward_zero_std": 1.0, "grad_norm": 0.026204617226878703, "kl": 0.002044677734375, "learning_rate": 2.606919083265453e-07, "loss": 0.0001, "num_tokens": 9928657.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 363.875, "completions/mean_terminated_length": 363.875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.7507025291047772, "frac_reward_zero_std": 1.0, "grad_norm": 0.03653174952874579, "kl": 0.002777099609375, "learning_rate": 2.603700517580788e-07, "loss": 0.0001, "num_tokens": 9932664.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 378.75, "completions/mean_terminated_length": 378.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7509701592399304, "frac_reward_zero_std": 1.0, "grad_norm": 0.029956892853189598, "kl": 0.002399444580078125, "learning_rate": 2.600484479386125e-07, "loss": 0.0001, "num_tokens": 9936942.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7512377893750837, "frac_reward_zero_std": 1.0, "grad_norm": 0.02988982573986796, "kl": 0.00312042236328125, "learning_rate": 2.59727097148797e-07, "loss": 0.0001, "num_tokens": 9940423.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 368.875, "completions/mean_terminated_length": 368.875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.7515054195102369, "frac_reward_zero_std": 1.0, "grad_norm": 0.025523993259457787, "kl": 0.002124786376953125, "learning_rate": 2.594059996690636e-07, "loss": 0.0001, "num_tokens": 9944446.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 270.125, "completions/mean_terminated_length": 270.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.75177304964539, "frac_reward_zero_std": 1.0, "grad_norm": 0.12808914031447002, "kl": 0.00650787353515625, "learning_rate": 2.5908515577962154e-07, "loss": 0.0003, "num_tokens": 9947603.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 425.5, "completions/mean_terminated_length": 425.5, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.7520406797805433, "frac_reward_zero_std": 0.5, "grad_norm": 0.8837749638001575, "kl": 0.00205230712890625, "learning_rate": 2.5876456576045904e-07, "loss": 0.0113, "num_tokens": 9952187.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7523083099156965, "frac_reward_zero_std": 1.0, "grad_norm": 0.03722278057762776, "kl": 0.002593994140625, "learning_rate": 2.584442298913429e-07, "loss": 0.0001, "num_tokens": 9955735.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.7525759400508497, "frac_reward_zero_std": 1.0, "grad_norm": 0.06457605155775258, "kl": 0.00307464599609375, "learning_rate": 2.5812414845181785e-07, "loss": 0.0001, "num_tokens": 9958668.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 401.625, "completions/mean_terminated_length": 401.625, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7528435701860029, "frac_reward_zero_std": 1.0, "grad_norm": 0.029498287458084842, "kl": 0.001865386962890625, "learning_rate": 2.578043217212069e-07, "loss": 0.0001, "num_tokens": 9963137.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 362.375, "completions/mean_terminated_length": 362.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7531112003211562, "frac_reward_zero_std": 1.0, "grad_norm": 0.016729334023714477, "kl": 0.00159454345703125, "learning_rate": 2.5748474997861027e-07, "loss": 0.0001, "num_tokens": 9967112.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 240.25, "completions/mean_terminated_length": 240.25, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7533788304563094, "frac_reward_zero_std": 1.0, "grad_norm": 0.06811797022971894, "kl": 0.001628875732421875, "learning_rate": 2.571654335029065e-07, "loss": 0.0001, "num_tokens": 9970102.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 307.0, "completions/mean_terminated_length": 307.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7536464605914626, "frac_reward_zero_std": 0.5, "grad_norm": 1.0536507795124364, "kl": 0.00170135498046875, "learning_rate": 2.5684637257275044e-07, "loss": -0.0085, "num_tokens": 9973702.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 271.375, "completions/mean_terminated_length": 271.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7539140907266159, "frac_reward_zero_std": 1.0, "grad_norm": 0.01616107937272534, "kl": 0.001239776611328125, "learning_rate": 2.5652756746657476e-07, "loss": 0.0, "num_tokens": 9976949.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 431.0, "completions/max_terminated_length": 431.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.754181720861769, "frac_reward_zero_std": 1.0, "grad_norm": 0.011852094675570685, "kl": 0.000759124755859375, "learning_rate": 2.562090184625885e-07, "loss": 0.0, "num_tokens": 9980476.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 221.0, "completions/max_terminated_length": 221.0, "completions/mean_length": 179.375, "completions/mean_terminated_length": 179.375, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7544493509969222, "frac_reward_zero_std": 1.0, "grad_norm": 0.017939311875410088, "kl": 0.0012149810791015625, "learning_rate": 2.558907258387767e-07, "loss": 0.0, "num_tokens": 9982847.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 708.0, "completions/max_terminated_length": 708.0, "completions/mean_length": 453.75, "completions/mean_terminated_length": 453.75, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.7547169811320755, "frac_reward_zero_std": 0.5, "grad_norm": 0.6115240162105084, "kl": 0.0009098052978515625, "learning_rate": 2.5557268987290193e-07, "loss": 0.071, "num_tokens": 9987569.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7549846112672287, "frac_reward_zero_std": 1.0, "grad_norm": 0.02175158851413776, "kl": 0.00193023681640625, "learning_rate": 2.5525491084250136e-07, "loss": 0.0001, "num_tokens": 9991063.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.7552522414023819, "frac_reward_zero_std": 1.0, "grad_norm": 0.013633439881819762, "kl": 0.0012664794921875, "learning_rate": 2.5493738902488915e-07, "loss": 0.0001, "num_tokens": 9995020.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 317.375, "completions/mean_terminated_length": 317.375, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7555198715375351, "frac_reward_zero_std": 1.0, "grad_norm": 0.03239544028643804, "kl": 0.00226593017578125, "learning_rate": 2.546201246971542e-07, "loss": 0.0001, "num_tokens": 9998675.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7557875016726884, "frac_reward_zero_std": 1.0, "grad_norm": 0.013759028620417607, "kl": 0.001068115234375, "learning_rate": 2.543031181361609e-07, "loss": 0.0, "num_tokens": 10002222.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 275.25, "completions/mean_terminated_length": 275.25, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.7560551318078416, "frac_reward_zero_std": 1.0, "grad_norm": 0.020905929542561166, "kl": 0.0009822845458984375, "learning_rate": 2.5398636961854896e-07, "loss": 0.0, "num_tokens": 10005464.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7563227619429947, "frac_reward_zero_std": 1.0, "grad_norm": 0.013779734089542042, "kl": 0.000720977783203125, "learning_rate": 2.5366987942073265e-07, "loss": 0.0, "num_tokens": 10008500.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 225.0, "completions/mean_terminated_length": 225.0, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.756590392078148, "frac_reward_zero_std": 1.0, "grad_norm": 0.061734271712415724, "kl": 0.00330352783203125, "learning_rate": 2.533536478189009e-07, "loss": 0.0001, "num_tokens": 10011340.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 429.0, "completions/max_terminated_length": 429.0, "completions/mean_length": 315.125, "completions/mean_terminated_length": 315.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7568580222133012, "frac_reward_zero_std": 1.0, "grad_norm": 0.01602901978919326, "kl": 0.001056671142578125, "learning_rate": 2.530376750890169e-07, "loss": 0.0, "num_tokens": 10014881.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.7571256523484544, "frac_reward_zero_std": 1.0, "grad_norm": 0.2011820878340046, "kl": 0.008880615234375, "learning_rate": 2.5272196150681806e-07, "loss": 0.0004, "num_tokens": 10017942.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.7573932824836076, "frac_reward_zero_std": 1.0, "grad_norm": 0.02069058980178681, "kl": 0.001678466796875, "learning_rate": 2.5240650734781555e-07, "loss": 0.0001, "num_tokens": 10021222.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 244.0, "completions/max_terminated_length": 244.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7576609126187609, "frac_reward_zero_std": 1.0, "grad_norm": 0.02400781814575477, "kl": 0.0019989013671875, "learning_rate": 2.5209131288729425e-07, "loss": 0.0001, "num_tokens": 10023943.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7579285427539141, "frac_reward_zero_std": 1.0, "grad_norm": 0.022138833525637065, "kl": 0.0012292861938476562, "learning_rate": 2.517763784003121e-07, "loss": 0.0, "num_tokens": 10027617.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.7581961728890673, "frac_reward_zero_std": 1.0, "grad_norm": 0.012540119856794627, "kl": 0.00101470947265625, "learning_rate": 2.5146170416170064e-07, "loss": 0.0, "num_tokens": 10030927.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 374.25, "completions/mean_terminated_length": 374.25, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.7584638030242206, "frac_reward_zero_std": 1.0, "grad_norm": 0.01961100922511129, "kl": 0.0021514892578125, "learning_rate": 2.51147290446064e-07, "loss": 0.0001, "num_tokens": 10035213.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 306.875, "completions/mean_terminated_length": 306.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.7587314331593737, "frac_reward_zero_std": 1.0, "grad_norm": 0.01414063934246403, "kl": 0.001758575439453125, "learning_rate": 2.508331375277789e-07, "loss": 0.0001, "num_tokens": 10038764.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7589990632945269, "frac_reward_zero_std": 1.0, "grad_norm": 0.03170760049020029, "kl": 0.00240325927734375, "learning_rate": 2.5051924568099486e-07, "loss": 0.0001, "num_tokens": 10041580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 439.0, "completions/max_terminated_length": 439.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7592666934296802, "frac_reward_zero_std": 1.0, "grad_norm": 0.029243079079159413, "kl": 0.0023345947265625, "learning_rate": 2.502056151796331e-07, "loss": 0.0001, "num_tokens": 10045173.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 257.625, "completions/mean_terminated_length": 257.625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.7595343235648334, "frac_reward_zero_std": 1.0, "grad_norm": 0.012911216655820755, "kl": 0.000865936279296875, "learning_rate": 2.4989224629738704e-07, "loss": 0.0, "num_tokens": 10048158.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7598019536999866, "frac_reward_zero_std": 1.0, "grad_norm": 0.01971954072639006, "kl": 0.0010290145874023438, "learning_rate": 2.4957913930772176e-07, "loss": 0.0, "num_tokens": 10051462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.7600695838351398, "frac_reward_zero_std": 0.5, "grad_norm": 1.3505051004833544, "kl": 0.00240325927734375, "learning_rate": 2.492662944838738e-07, "loss": -0.0251, "num_tokens": 10054517.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 448.25, "completions/mean_terminated_length": 448.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.7603372139702931, "frac_reward_zero_std": 1.0, "grad_norm": 0.023571948326177324, "kl": 0.001522064208984375, "learning_rate": 2.4895371209885084e-07, "loss": 0.0001, "num_tokens": 10059187.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.7606048441054463, "frac_reward_zero_std": 1.0, "grad_norm": 0.02642768274513921, "kl": 0.00160980224609375, "learning_rate": 2.486413924254318e-07, "loss": 0.0001, "num_tokens": 10062411.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 425.5, "completions/mean_terminated_length": 425.5, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "epoch": 0.7608724742405995, "frac_reward_zero_std": 1.0, "grad_norm": 0.030910128653222235, "kl": 0.0014190673828125, "learning_rate": 2.4832933573616557e-07, "loss": 0.0001, "num_tokens": 10066847.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 234.875, "completions/mean_terminated_length": 234.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.7611401043757527, "frac_reward_zero_std": 0.5, "grad_norm": 1.4401869320765721, "kl": 0.0024871826171875, "learning_rate": 2.480175423033729e-07, "loss": -0.0389, "num_tokens": 10069742.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 594.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 431.125, "completions/mean_terminated_length": 431.125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7614077345109059, "frac_reward_zero_std": 0.5, "grad_norm": 0.5660513514269397, "kl": 0.0016021728515625, "learning_rate": 2.4770601239914323e-07, "loss": 0.0001, "num_tokens": 10074443.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.7616753646460591, "frac_reward_zero_std": 1.0, "grad_norm": 0.04117013442896923, "kl": 0.00238037109375, "learning_rate": 2.473947462953374e-07, "loss": 0.0001, "num_tokens": 10077604.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 235.5, "completions/mean_terminated_length": 235.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7619429947812124, "frac_reward_zero_std": 1.0, "grad_norm": 0.04105132029500101, "kl": 0.00229644775390625, "learning_rate": 2.470837442635854e-07, "loss": 0.0001, "num_tokens": 10080692.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7622106249163656, "frac_reward_zero_std": 1.0, "grad_norm": 0.028072334595900424, "kl": 0.0017852783203125, "learning_rate": 2.467730065752864e-07, "loss": 0.0001, "num_tokens": 10083321.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7624782550515188, "frac_reward_zero_std": 1.0, "grad_norm": 0.03158352869355994, "kl": 0.00281524658203125, "learning_rate": 2.4646253350160987e-07, "loss": 0.0001, "num_tokens": 10086869.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 238.5, "completions/mean_terminated_length": 238.5, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.762745885186672, "frac_reward_zero_std": 1.0, "grad_norm": 0.01938726870131581, "kl": 0.0025634765625, "learning_rate": 2.461523253134933e-07, "loss": 0.0001, "num_tokens": 10089857.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7630135153218253, "frac_reward_zero_std": 1.0, "grad_norm": 0.03341997914941248, "kl": 0.000743865966796875, "learning_rate": 2.4584238228164406e-07, "loss": 0.0, "num_tokens": 10093229.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 246.5, "completions/mean_terminated_length": 246.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7632811454569784, "frac_reward_zero_std": 1.0, "grad_norm": 0.034759589873659326, "kl": 0.00312042236328125, "learning_rate": 2.4553270467653714e-07, "loss": 0.0001, "num_tokens": 10096221.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 355.5, "completions/mean_terminated_length": 355.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.7635487755921316, "frac_reward_zero_std": 0.5, "grad_norm": 0.7975927316929668, "kl": 0.002025604248046875, "learning_rate": 2.452232927684166e-07, "loss": 0.0001, "num_tokens": 10100281.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.7638164057272849, "frac_reward_zero_std": 1.0, "grad_norm": 0.01207282238092645, "kl": 0.0009860992431640625, "learning_rate": 2.449141468272943e-07, "loss": 0.0, "num_tokens": 10103921.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 335.75, "completions/mean_terminated_length": 335.75, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.7640840358624381, "frac_reward_zero_std": 1.0, "grad_norm": 0.020413363134809293, "kl": 0.001678466796875, "learning_rate": 2.446052671229502e-07, "loss": 0.0001, "num_tokens": 10107723.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 333.125, "completions/mean_terminated_length": 333.125, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.7643516659975913, "frac_reward_zero_std": 1.0, "grad_norm": 0.033240770107180316, "kl": 0.00255584716796875, "learning_rate": 2.442966539249318e-07, "loss": 0.0001, "num_tokens": 10111484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 276.625, "completions/mean_terminated_length": 276.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.7646192961327446, "frac_reward_zero_std": 1.0, "grad_norm": 0.017908662201193914, "kl": 0.001617431640625, "learning_rate": 2.4398830750255397e-07, "loss": 0.0001, "num_tokens": 10114813.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.7648869262678978, "frac_reward_zero_std": 1.0, "grad_norm": 0.022043040583990862, "kl": 0.0015716552734375, "learning_rate": 2.4368022812489893e-07, "loss": 0.0001, "num_tokens": 10119037.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 359.375, "completions/mean_terminated_length": 359.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.765154556403051, "frac_reward_zero_std": 1.0, "grad_norm": 0.025155719831549798, "kl": 0.0019989013671875, "learning_rate": 2.4337241606081587e-07, "loss": 0.0001, "num_tokens": 10123080.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 275.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7654221865382042, "frac_reward_zero_std": 1.0, "grad_norm": 0.016260424051263607, "kl": 0.00191497802734375, "learning_rate": 2.430648715789205e-07, "loss": 0.0001, "num_tokens": 10126282.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7656898166733574, "frac_reward_zero_std": 1.0, "grad_norm": 0.01421725807021712, "kl": 0.00130462646484375, "learning_rate": 2.4275759494759523e-07, "loss": 0.0001, "num_tokens": 10129901.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7659574468085106, "frac_reward_zero_std": 1.0, "grad_norm": 0.017762008317588816, "kl": 0.001834869384765625, "learning_rate": 2.424505864349886e-07, "loss": 0.0001, "num_tokens": 10133345.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 379.0, "completions/mean_terminated_length": 379.0, "completions/min_length": 281.0, "completions/min_terminated_length": 281.0, "epoch": 0.7662250769436638, "frac_reward_zero_std": 1.0, "grad_norm": 0.009609464572356657, "kl": 0.0007781982421875, "learning_rate": 2.421438463090152e-07, "loss": 0.0, "num_tokens": 10137421.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 238.75, "completions/mean_terminated_length": 238.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.7664927070788171, "frac_reward_zero_std": 1.0, "grad_norm": 0.018795473485402746, "kl": 0.00146484375, "learning_rate": 2.418373748373555e-07, "loss": 0.0001, "num_tokens": 10140299.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 230.25, "completions/mean_terminated_length": 230.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7667603372139703, "frac_reward_zero_std": 0.5, "grad_norm": 1.0067009718571707, "kl": 0.0016326904296875, "learning_rate": 2.415311722874554e-07, "loss": 0.0001, "num_tokens": 10143225.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7670279673491235, "frac_reward_zero_std": 0.5, "grad_norm": 0.7406028476865691, "kl": 0.002361297607421875, "learning_rate": 2.4122523892652623e-07, "loss": 0.0001, "num_tokens": 10146808.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7672955974842768, "frac_reward_zero_std": 1.0, "grad_norm": 0.013263039956801807, "kl": 0.001190185546875, "learning_rate": 2.409195750215443e-07, "loss": 0.0, "num_tokens": 10149754.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 279.0, "completions/mean_terminated_length": 279.0, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.76756322761943, "frac_reward_zero_std": 1.0, "grad_norm": 0.014578234420770294, "kl": 0.001079559326171875, "learning_rate": 2.406141808392509e-07, "loss": 0.0, "num_tokens": 10153054.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 270.0, "completions/mean_terminated_length": 270.0, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7678308577545832, "frac_reward_zero_std": 1.0, "grad_norm": 0.01624340667212053, "kl": 0.00135040283203125, "learning_rate": 2.4030905664615175e-07, "loss": 0.0001, "num_tokens": 10156394.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 259.375, "completions/mean_terminated_length": 259.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7680984878897363, "frac_reward_zero_std": 1.0, "grad_norm": 0.030437119112243456, "kl": 0.0028533935546875, "learning_rate": 2.400042027085172e-07, "loss": 0.0001, "num_tokens": 10159377.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 314.875, "completions/mean_terminated_length": 314.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7683661180248896, "frac_reward_zero_std": 1.0, "grad_norm": 0.013455995887723718, "kl": 0.000797271728515625, "learning_rate": 2.3969961929238175e-07, "loss": 0.0, "num_tokens": 10162960.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 362.75, "completions/mean_terminated_length": 362.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7686337481600428, "frac_reward_zero_std": 1.0, "grad_norm": 0.011677631386744622, "kl": 0.00127410888671875, "learning_rate": 2.3939530666354333e-07, "loss": 0.0001, "num_tokens": 10166846.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 917.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 505.375, "completions/mean_terminated_length": 505.375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.768901378295196, "frac_reward_zero_std": 0.5, "grad_norm": 0.6137158719876488, "kl": 0.00156402587890625, "learning_rate": 2.3909126508756445e-07, "loss": 0.0286, "num_tokens": 10171981.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 310.125, "completions/mean_terminated_length": 310.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7691690084303493, "frac_reward_zero_std": 1.0, "grad_norm": 0.04529273571930467, "kl": 0.00250244140625, "learning_rate": 2.3878749482977004e-07, "loss": 0.0001, "num_tokens": 10175430.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 356.625, "completions/mean_terminated_length": 356.625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.7694366385655025, "frac_reward_zero_std": 1.0, "grad_norm": 0.02328614946042426, "kl": 0.0018157958984375, "learning_rate": 2.3848399615524943e-07, "loss": 0.0001, "num_tokens": 10179439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 308.25, "completions/mean_terminated_length": 308.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.7697042687006557, "frac_reward_zero_std": 0.5, "grad_norm": 0.9536806546912733, "kl": 0.001667022705078125, "learning_rate": 2.381807693288539e-07, "loss": 0.0455, "num_tokens": 10182845.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.769971898835809, "frac_reward_zero_std": 1.0, "grad_norm": 0.029180244046414408, "kl": 0.00263214111328125, "learning_rate": 2.3787781461519788e-07, "loss": 0.0001, "num_tokens": 10186579.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 382.125, "completions/mean_terminated_length": 382.125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.7702395289709622, "frac_reward_zero_std": 1.0, "grad_norm": 0.015094549966810096, "kl": 0.001544952392578125, "learning_rate": 2.3757513227865876e-07, "loss": 0.0001, "num_tokens": 10190788.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 450.875, "completions/mean_terminated_length": 450.875, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.7705071591061153, "frac_reward_zero_std": 0.5, "grad_norm": 0.6265575089718793, "kl": 0.001934051513671875, "learning_rate": 2.3727272258337542e-07, "loss": 0.0533, "num_tokens": 10195519.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 276.0, "completions/mean_terminated_length": 276.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7707747892412685, "frac_reward_zero_std": 0.5, "grad_norm": 1.206900463401866, "kl": 0.0021209716796875, "learning_rate": 2.3697058579324973e-07, "loss": -0.0057, "num_tokens": 10198851.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 597.0, "completions/max_terminated_length": 597.0, "completions/mean_length": 444.125, "completions/mean_terminated_length": 444.125, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.7710424193764218, "frac_reward_zero_std": 0.5, "grad_norm": 0.6615897311212465, "kl": 0.0015106201171875, "learning_rate": 2.3666872217194455e-07, "loss": 0.0001, "num_tokens": 10203668.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 321.375, "completions/mean_terminated_length": 321.375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.771310049511575, "frac_reward_zero_std": 1.0, "grad_norm": 0.015465104583869453, "kl": 0.001750946044921875, "learning_rate": 2.3636713198288492e-07, "loss": 0.0001, "num_tokens": 10207163.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 311.375, "completions/mean_terminated_length": 311.375, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7715776796467282, "frac_reward_zero_std": 1.0, "grad_norm": 0.007187236482684877, "kl": 0.0005092620849609375, "learning_rate": 2.360658154892569e-07, "loss": 0.0, "num_tokens": 10210962.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 230.125, "completions/mean_terminated_length": 230.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.7718453097818815, "frac_reward_zero_std": 1.0, "grad_norm": 0.02989209376251148, "kl": 0.00146484375, "learning_rate": 2.3576477295400817e-07, "loss": 0.0001, "num_tokens": 10213715.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 304.0, "completions/mean_terminated_length": 304.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.7721129399170347, "frac_reward_zero_std": 1.0, "grad_norm": 0.03101387652177723, "kl": 0.00229644775390625, "learning_rate": 2.3546400463984687e-07, "loss": 0.0001, "num_tokens": 10217351.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.7723805700521879, "frac_reward_zero_std": 1.0, "grad_norm": 0.01049021089495107, "kl": 0.00083160400390625, "learning_rate": 2.3516351080924204e-07, "loss": 0.0, "num_tokens": 10220274.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 443.0, "completions/max_terminated_length": 443.0, "completions/mean_length": 395.375, "completions/mean_terminated_length": 395.375, "completions/min_length": 343.0, "completions/min_terminated_length": 343.0, "epoch": 0.7726482001873411, "frac_reward_zero_std": 1.0, "grad_norm": 0.01589447740201892, "kl": 0.001514434814453125, "learning_rate": 2.3486329172442323e-07, "loss": 0.0001, "num_tokens": 10224737.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 312.5, "completions/mean_terminated_length": 312.5, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.7729158303224943, "frac_reward_zero_std": 1.0, "grad_norm": 0.011463715240234457, "kl": 0.0008697509765625, "learning_rate": 2.3456334764738016e-07, "loss": 0.0, "num_tokens": 10228233.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 324.875, "completions/mean_terminated_length": 324.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.7731834604576475, "frac_reward_zero_std": 1.0, "grad_norm": 0.011100803822776863, "kl": 0.00086212158203125, "learning_rate": 2.3426367883986256e-07, "loss": 0.0, "num_tokens": 10231944.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7734510905928007, "frac_reward_zero_std": 1.0, "grad_norm": 0.028963856056931282, "kl": 0.0028533935546875, "learning_rate": 2.3396428556337997e-07, "loss": 0.0001, "num_tokens": 10235820.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.773718720727954, "frac_reward_zero_std": 1.0, "grad_norm": 0.017280302242991142, "kl": 0.002483367919921875, "learning_rate": 2.3366516807920147e-07, "loss": 0.0001, "num_tokens": 10239388.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 240.375, "completions/mean_terminated_length": 240.375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7739863508631072, "frac_reward_zero_std": 1.0, "grad_norm": 0.026848893828252063, "kl": 0.00258636474609375, "learning_rate": 2.333663266483555e-07, "loss": 0.0001, "num_tokens": 10242303.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.7742539809982604, "frac_reward_zero_std": 1.0, "grad_norm": 0.010578802213459236, "kl": 0.001251220703125, "learning_rate": 2.3306776153162955e-07, "loss": 0.0001, "num_tokens": 10246115.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.7745216111334137, "frac_reward_zero_std": 1.0, "grad_norm": 0.019783881836988322, "kl": 0.00189971923828125, "learning_rate": 2.3276947298956996e-07, "loss": 0.0001, "num_tokens": 10249516.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 231.0, "completions/mean_terminated_length": 231.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7747892412685669, "frac_reward_zero_std": 1.0, "grad_norm": 0.015340425307724469, "kl": 0.00146484375, "learning_rate": 2.324714612824818e-07, "loss": 0.0001, "num_tokens": 10252340.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 476.75, "completions/mean_terminated_length": 476.75, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.77505687140372, "frac_reward_zero_std": 1.0, "grad_norm": 0.021435584053714422, "kl": 0.001094818115234375, "learning_rate": 2.321737266704285e-07, "loss": 0.0, "num_tokens": 10257502.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 243.0, "completions/max_terminated_length": 243.0, "completions/mean_length": 185.625, "completions/mean_terminated_length": 185.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7753245015388732, "frac_reward_zero_std": 1.0, "grad_norm": 0.01797446632618675, "kl": 0.001247406005859375, "learning_rate": 2.3187626941323167e-07, "loss": 0.0, "num_tokens": 10259915.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 446.0, "completions/mean_terminated_length": 446.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7755921316740265, "frac_reward_zero_std": 0.5, "grad_norm": 0.4968790957214807, "kl": 0.0015106201171875, "learning_rate": 2.3157908977047095e-07, "loss": -0.0402, "num_tokens": 10264655.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 281.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.7758597618091797, "frac_reward_zero_std": 1.0, "grad_norm": 0.030779584556523604, "kl": 0.0029144287109375, "learning_rate": 2.3128218800148364e-07, "loss": 0.0001, "num_tokens": 10267827.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.7761273919443329, "frac_reward_zero_std": 1.0, "grad_norm": 0.01591636571269476, "kl": 0.001194000244140625, "learning_rate": 2.3098556436536468e-07, "loss": 0.0, "num_tokens": 10271539.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 252.625, "completions/mean_terminated_length": 252.625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7763950220794862, "frac_reward_zero_std": 1.0, "grad_norm": 0.020432984200963986, "kl": 0.00176239013671875, "learning_rate": 2.3068921912096587e-07, "loss": 0.0001, "num_tokens": 10274708.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7766626522146394, "frac_reward_zero_std": 1.0, "grad_norm": 0.016687312740516075, "kl": 0.00193023681640625, "learning_rate": 2.3039315252689686e-07, "loss": 0.0001, "num_tokens": 10277747.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 462.625, "completions/mean_terminated_length": 462.625, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.7769302823497926, "frac_reward_zero_std": 1.0, "grad_norm": 0.011765576965791981, "kl": 0.0010128021240234375, "learning_rate": 2.3009736484152323e-07, "loss": 0.0, "num_tokens": 10282740.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 264.0, "completions/mean_terminated_length": 264.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.7771979124849459, "frac_reward_zero_std": 1.0, "grad_norm": 0.023834695074048826, "kl": 0.00159454345703125, "learning_rate": 2.2980185632296796e-07, "loss": 0.0001, "num_tokens": 10285868.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 401.0, "completions/max_terminated_length": 401.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.777465542620099, "frac_reward_zero_std": 0.5, "grad_norm": 0.9705132107892225, "kl": 0.00289154052734375, "learning_rate": 2.2950662722910985e-07, "loss": -0.0044, "num_tokens": 10289219.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 583.0, "completions/max_terminated_length": 583.0, "completions/mean_length": 305.25, "completions/mean_terminated_length": 305.25, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7777331727552522, "frac_reward_zero_std": 1.0, "grad_norm": 0.016471553209174223, "kl": 0.001369476318359375, "learning_rate": 2.2921167781758388e-07, "loss": 0.0001, "num_tokens": 10292777.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 350.0, "completions/mean_terminated_length": 350.0, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.7780008028904054, "frac_reward_zero_std": 1.0, "grad_norm": 0.01221448602024562, "kl": 0.0010986328125, "learning_rate": 2.289170083457817e-07, "loss": 0.0, "num_tokens": 10296597.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 334.0, "completions/mean_terminated_length": 334.0, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.7782684330255587, "frac_reward_zero_std": 1.0, "grad_norm": 0.030826604011741213, "kl": 0.001434326171875, "learning_rate": 2.2862261907084948e-07, "loss": 0.0001, "num_tokens": 10300369.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "epoch": 0.7785360631607119, "frac_reward_zero_std": 1.0, "grad_norm": 0.015297308249986808, "kl": 0.0015411376953125, "learning_rate": 2.2832851024969007e-07, "loss": 0.0001, "num_tokens": 10304578.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 185.75, "completions/mean_terminated_length": 185.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.7788036932958651, "frac_reward_zero_std": 0.5, "grad_norm": 1.7211703712642952, "kl": 0.0037994384765625, "learning_rate": 2.2803468213896063e-07, "loss": -0.0107, "num_tokens": 10307004.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7790713234310184, "frac_reward_zero_std": 1.0, "grad_norm": 0.014754318194198408, "kl": 0.00118255615234375, "learning_rate": 2.277411349950738e-07, "loss": 0.0, "num_tokens": 10309768.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 226.125, "completions/mean_terminated_length": 226.125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.7793389535661716, "frac_reward_zero_std": 1.0, "grad_norm": 0.020629128259956723, "kl": 0.00147247314453125, "learning_rate": 2.27447869074197e-07, "loss": 0.0001, "num_tokens": 10312629.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 250.0, "completions/mean_terminated_length": 250.0, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.7796065837013247, "frac_reward_zero_std": 1.0, "grad_norm": 0.029656256562678052, "kl": 0.00159454345703125, "learning_rate": 2.2715488463225223e-07, "loss": 0.0001, "num_tokens": 10315601.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 311.75, "completions/mean_terminated_length": 311.75, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.779874213836478, "frac_reward_zero_std": 1.0, "grad_norm": 0.023437606190607824, "kl": 0.002178192138671875, "learning_rate": 2.2686218192491586e-07, "loss": 0.0001, "num_tokens": 10319159.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 437.625, "completions/mean_terminated_length": 437.625, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.7801418439716312, "frac_reward_zero_std": 0.0, "grad_norm": 1.1106312081476768, "kl": 0.0020294189453125, "learning_rate": 2.2656976120761815e-07, "loss": 0.056, "num_tokens": 10324004.0, "reward": 0.5, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 292.375, "completions/mean_terminated_length": 292.375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.7804094741067844, "frac_reward_zero_std": 1.0, "grad_norm": 0.017830879826430862, "kl": 0.002105712890625, "learning_rate": 2.2627762273554392e-07, "loss": 0.0001, "num_tokens": 10327283.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.7806771042419376, "frac_reward_zero_std": 1.0, "grad_norm": 0.038112400220176486, "kl": 0.00232696533203125, "learning_rate": 2.259857667636309e-07, "loss": 0.0001, "num_tokens": 10330553.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7809447343770909, "frac_reward_zero_std": 1.0, "grad_norm": 0.022264949487330623, "kl": 0.0018463134765625, "learning_rate": 2.2569419354657077e-07, "loss": 0.0001, "num_tokens": 10334162.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.7812123645122441, "frac_reward_zero_std": 1.0, "grad_norm": 0.027928389886362388, "kl": 0.001617431640625, "learning_rate": 2.2540290333880835e-07, "loss": 0.0001, "num_tokens": 10338085.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 302.25, "completions/mean_terminated_length": 302.25, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.7814799946473973, "frac_reward_zero_std": 1.0, "grad_norm": 0.008783048653324677, "kl": 0.0008640289306640625, "learning_rate": 2.2511189639454147e-07, "loss": 0.0, "num_tokens": 10341515.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 273.0, "completions/max_terminated_length": 273.0, "completions/mean_length": 240.125, "completions/mean_terminated_length": 240.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.7817476247825506, "frac_reward_zero_std": 1.0, "grad_norm": 0.01573723711836777, "kl": 0.001438140869140625, "learning_rate": 2.2482117296772078e-07, "loss": 0.0001, "num_tokens": 10344420.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 323.5, "completions/mean_terminated_length": 323.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7820152549177037, "frac_reward_zero_std": 0.5, "grad_norm": 0.9647252182875391, "kl": 0.0011081695556640625, "learning_rate": 2.2453073331204956e-07, "loss": 0.0311, "num_tokens": 10348124.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.7822828850528569, "frac_reward_zero_std": 1.0, "grad_norm": 0.014264821324074011, "kl": 0.001094818115234375, "learning_rate": 2.2424057768098336e-07, "loss": 0.0, "num_tokens": 10351231.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 393.125, "completions/mean_terminated_length": 393.125, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.7825505151880102, "frac_reward_zero_std": 1.0, "grad_norm": 0.010634098585404494, "kl": 0.00112152099609375, "learning_rate": 2.2395070632773005e-07, "loss": 0.0, "num_tokens": 10355456.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 547.0, "completions/max_terminated_length": 547.0, "completions/mean_length": 473.25, "completions/mean_terminated_length": 473.25, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.7828181453231634, "frac_reward_zero_std": 0.5, "grad_norm": 0.7577792343380387, "kl": 0.0013580322265625, "learning_rate": 2.2366111950524903e-07, "loss": -0.0116, "num_tokens": 10360510.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 402.375, "completions/mean_terminated_length": 402.375, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "epoch": 0.7830857754583166, "frac_reward_zero_std": 0.5, "grad_norm": 0.7382019414757693, "kl": 0.00136566162109375, "learning_rate": 2.2337181746625185e-07, "loss": -0.0241, "num_tokens": 10365105.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 458.0, "completions/max_terminated_length": 458.0, "completions/mean_length": 346.625, "completions/mean_terminated_length": 346.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.7833534055934698, "frac_reward_zero_std": 1.0, "grad_norm": 0.013489788339178025, "kl": 0.001617431640625, "learning_rate": 2.2308280046320135e-07, "loss": 0.0001, "num_tokens": 10369162.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 395.125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.7836210357286231, "frac_reward_zero_std": 0.5, "grad_norm": 0.7167188711541481, "kl": 0.001781463623046875, "learning_rate": 2.2279406874831164e-07, "loss": -0.0099, "num_tokens": 10373579.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 437.5, "completions/mean_terminated_length": 437.5, "completions/min_length": 379.0, "completions/min_terminated_length": 379.0, "epoch": 0.7838886658637763, "frac_reward_zero_std": 0.5, "grad_norm": 0.7023830263713676, "kl": 0.0015869140625, "learning_rate": 2.2250562257354795e-07, "loss": -0.0188, "num_tokens": 10378251.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.7841562959989294, "frac_reward_zero_std": 0.5, "grad_norm": 0.8706180413871731, "kl": 0.0020904541015625, "learning_rate": 2.2221746219062592e-07, "loss": -0.0842, "num_tokens": 10381291.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 373.125, "completions/mean_terminated_length": 373.125, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.7844239261340827, "frac_reward_zero_std": 1.0, "grad_norm": 0.014937765308247064, "kl": 0.001415252685546875, "learning_rate": 2.2192958785101256e-07, "loss": 0.0001, "num_tokens": 10385236.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.7846915562692359, "frac_reward_zero_std": 1.0, "grad_norm": 0.015687936832644172, "kl": 0.00145721435546875, "learning_rate": 2.2164199980592436e-07, "loss": 0.0001, "num_tokens": 10388812.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.7849591864043891, "frac_reward_zero_std": 1.0, "grad_norm": 0.01374211992485745, "kl": 0.001270294189453125, "learning_rate": 2.213546983063289e-07, "loss": 0.0001, "num_tokens": 10392111.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 409.125, "completions/mean_terminated_length": 409.125, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.7852268165395424, "frac_reward_zero_std": 1.0, "grad_norm": 0.016811792552504874, "kl": 0.00156402587890625, "learning_rate": 2.2106768360294287e-07, "loss": 0.0001, "num_tokens": 10396460.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 255.375, "completions/mean_terminated_length": 255.375, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7854944466746956, "frac_reward_zero_std": 1.0, "grad_norm": 0.0210308741148969, "kl": 0.001094818115234375, "learning_rate": 2.2078095594623302e-07, "loss": 0.0, "num_tokens": 10399543.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 347.0, "completions/mean_terminated_length": 347.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.7857620768098488, "frac_reward_zero_std": 1.0, "grad_norm": 0.013313527786719985, "kl": 0.0011749267578125, "learning_rate": 2.2049451558641603e-07, "loss": 0.0, "num_tokens": 10403923.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 334.5, "completions/mean_terminated_length": 334.5, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.786029706945002, "frac_reward_zero_std": 1.0, "grad_norm": 0.052053259967082634, "kl": 0.001499176025390625, "learning_rate": 2.2020836277345694e-07, "loss": 0.0001, "num_tokens": 10407711.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7862973370801553, "frac_reward_zero_std": 1.0, "grad_norm": 0.02063280154651199, "kl": 0.0012969970703125, "learning_rate": 2.1992249775707099e-07, "loss": 0.0001, "num_tokens": 10410645.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 455.375, "completions/mean_terminated_length": 455.375, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.7865649672153084, "frac_reward_zero_std": 0.5, "grad_norm": 0.7488836628406806, "kl": 0.0012054443359375, "learning_rate": 2.1963692078672105e-07, "loss": -0.071, "num_tokens": 10415436.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 393.375, "completions/mean_terminated_length": 393.375, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7868325973504616, "frac_reward_zero_std": 1.0, "grad_norm": 0.02042613062539934, "kl": 0.002166748046875, "learning_rate": 2.1935163211161984e-07, "loss": 0.0001, "num_tokens": 10419771.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 302.5, "completions/mean_terminated_length": 302.5, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.7871002274856149, "frac_reward_zero_std": 1.0, "grad_norm": 0.021927660275014915, "kl": 0.001312255859375, "learning_rate": 2.1906663198072733e-07, "loss": 0.0001, "num_tokens": 10423203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.7873678576207681, "frac_reward_zero_std": 0.5, "grad_norm": 0.6926240062863208, "kl": 0.00197601318359375, "learning_rate": 2.1878192064275262e-07, "loss": 0.0023, "num_tokens": 10427356.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 411.875, "completions/mean_terminated_length": 411.875, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.7876354877559213, "frac_reward_zero_std": 1.0, "grad_norm": 0.020435345776970974, "kl": 0.0016632080078125, "learning_rate": 2.1849749834615233e-07, "loss": 0.0001, "num_tokens": 10432035.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 359.875, "completions/mean_terminated_length": 359.875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.7879031178910746, "frac_reward_zero_std": 0.5, "grad_norm": 0.9846389591725069, "kl": 0.002330780029296875, "learning_rate": 2.182133653391307e-07, "loss": 0.0205, "num_tokens": 10435938.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 477.0, "completions/max_terminated_length": 477.0, "completions/mean_length": 374.75, "completions/mean_terminated_length": 374.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7881707480262278, "frac_reward_zero_std": 1.0, "grad_norm": 0.010814872364108724, "kl": 0.00086212158203125, "learning_rate": 2.1792952186964037e-07, "loss": 0.0, "num_tokens": 10440088.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 389.375, "completions/mean_terminated_length": 389.375, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.788438378161381, "frac_reward_zero_std": 1.0, "grad_norm": 0.016161735532921412, "kl": 0.00170135498046875, "learning_rate": 2.176459681853801e-07, "loss": 0.0001, "num_tokens": 10444507.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 519.125, "completions/mean_terminated_length": 447.0000305175781, "completions/min_length": 340.0, "completions/min_terminated_length": 340.0, "epoch": 0.7887060082965341, "frac_reward_zero_std": 0.5, "grad_norm": 0.7360206408331892, "kl": 0.001617431640625, "learning_rate": 2.1736270453379661e-07, "loss": 0.1254, "num_tokens": 10449960.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.7889736384316874, "frac_reward_zero_std": 1.0, "grad_norm": 0.013123220137020646, "kl": 0.00135040283203125, "learning_rate": 2.170797311620833e-07, "loss": 0.0001, "num_tokens": 10454388.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 571.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 430.375, "completions/mean_terminated_length": 430.375, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.7892412685668406, "frac_reward_zero_std": 0.5, "grad_norm": 0.6726420848518769, "kl": 0.00115966796875, "learning_rate": 2.1679704831718014e-07, "loss": -0.0434, "num_tokens": 10459079.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 357.75, "completions/mean_terminated_length": 357.75, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7895088987019938, "frac_reward_zero_std": 1.0, "grad_norm": 0.010812416415007978, "kl": 0.000850677490234375, "learning_rate": 2.165146562457737e-07, "loss": 0.0, "num_tokens": 10463117.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 866.0, "completions/max_terminated_length": 866.0, "completions/mean_length": 360.25, "completions/mean_terminated_length": 360.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.7897765288371471, "frac_reward_zero_std": 0.5, "grad_norm": 0.6565325071129956, "kl": 0.0010223388671875, "learning_rate": 2.1623255519429676e-07, "loss": 0.0394, "num_tokens": 10467015.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 294.625, "completions/mean_terminated_length": 294.625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.7900441589723003, "frac_reward_zero_std": 1.0, "grad_norm": 0.012477448446009856, "kl": 0.0008831024169921875, "learning_rate": 2.1595074540892816e-07, "loss": 0.0, "num_tokens": 10470520.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 516.0, "completions/max_terminated_length": 516.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.7903117891074535, "frac_reward_zero_std": 1.0, "grad_norm": 0.01614854919961012, "kl": 0.001434326171875, "learning_rate": 2.1566922713559266e-07, "loss": 0.0001, "num_tokens": 10474180.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 239.625, "completions/mean_terminated_length": 239.625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.7905794192426067, "frac_reward_zero_std": 1.0, "grad_norm": 0.012844027728549682, "kl": 0.00118255615234375, "learning_rate": 2.1538800061996006e-07, "loss": 0.0, "num_tokens": 10477165.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 301.125, "completions/mean_terminated_length": 301.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.79084704937776, "frac_reward_zero_std": 1.0, "grad_norm": 0.009128184043355922, "kl": 0.0008335113525390625, "learning_rate": 2.151070661074465e-07, "loss": 0.0, "num_tokens": 10480550.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.7911146795129131, "frac_reward_zero_std": 1.0, "grad_norm": 0.01684800740702854, "kl": 0.00124359130859375, "learning_rate": 2.1482642384321264e-07, "loss": 0.0, "num_tokens": 10484258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 319.75, "completions/mean_terminated_length": 319.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.7913823096480663, "frac_reward_zero_std": 1.0, "grad_norm": 0.014106144778160377, "kl": 0.0012454986572265625, "learning_rate": 2.1454607407216421e-07, "loss": 0.0, "num_tokens": 10487812.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 272.625, "completions/mean_terminated_length": 272.625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.7916499397832196, "frac_reward_zero_std": 1.0, "grad_norm": 0.017382523868235397, "kl": 0.001621246337890625, "learning_rate": 2.1426601703895192e-07, "loss": 0.0001, "num_tokens": 10491289.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 413.375, "completions/mean_terminated_length": 413.375, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.7919175699183728, "frac_reward_zero_std": 1.0, "grad_norm": 0.014238256838484923, "kl": 0.0013885498046875, "learning_rate": 2.1398625298797053e-07, "loss": 0.0001, "num_tokens": 10495748.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 258.0, "completions/max_terminated_length": 258.0, "completions/mean_length": 192.625, "completions/mean_terminated_length": 192.625, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.792185200053526, "frac_reward_zero_std": 1.0, "grad_norm": 0.013935202503845301, "kl": 0.0008592605590820312, "learning_rate": 2.1370678216335986e-07, "loss": 0.0, "num_tokens": 10498181.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 481.625, "completions/mean_terminated_length": 481.625, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.7924528301886793, "frac_reward_zero_std": 0.5, "grad_norm": 0.5917448101555487, "kl": 0.002593994140625, "learning_rate": 2.1342760480900296e-07, "loss": 0.0173, "num_tokens": 10503234.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 335.125, "completions/mean_terminated_length": 335.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.7927204603238325, "frac_reward_zero_std": 1.0, "grad_norm": 0.012935699049914123, "kl": 0.001178741455078125, "learning_rate": 2.1314872116852778e-07, "loss": 0.0, "num_tokens": 10506899.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 312.625, "completions/mean_terminated_length": 312.625, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.7929880904589857, "frac_reward_zero_std": 1.0, "grad_norm": 0.014245180123891775, "kl": 0.00124359130859375, "learning_rate": 2.128701314853049e-07, "loss": 0.0, "num_tokens": 10510396.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.7932557205941388, "frac_reward_zero_std": 1.0, "grad_norm": 0.011492718549260076, "kl": 0.0007305145263671875, "learning_rate": 2.1259183600244932e-07, "loss": 0.0, "num_tokens": 10513721.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 535.0, "completions/max_terminated_length": 535.0, "completions/mean_length": 363.375, "completions/mean_terminated_length": 363.375, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7935233507292921, "frac_reward_zero_std": 1.0, "grad_norm": 0.029667676817511986, "kl": 0.0019378662109375, "learning_rate": 2.123138349628188e-07, "loss": 0.0001, "num_tokens": 10517828.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 338.25, "completions/mean_terminated_length": 338.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.7937909808644453, "frac_reward_zero_std": 1.0, "grad_norm": 0.029698144405990858, "kl": 0.00247955322265625, "learning_rate": 2.1203612860901393e-07, "loss": 0.0001, "num_tokens": 10521706.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 279.25, "completions/mean_terminated_length": 279.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.7940586109995985, "frac_reward_zero_std": 1.0, "grad_norm": 0.03519162777598787, "kl": 0.002716064453125, "learning_rate": 2.117587171833789e-07, "loss": 0.0001, "num_tokens": 10525024.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 388.5, "completions/mean_terminated_length": 388.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.7943262411347518, "frac_reward_zero_std": 1.0, "grad_norm": 0.017589300344455383, "kl": 0.001308441162109375, "learning_rate": 2.1148160092799963e-07, "loss": 0.0001, "num_tokens": 10529380.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.794593871269905, "frac_reward_zero_std": 0.5, "grad_norm": 0.5912795597440353, "kl": 0.001438140869140625, "learning_rate": 2.1120478008470537e-07, "loss": -0.0058, "num_tokens": 10533130.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 507.0, "completions/max_terminated_length": 507.0, "completions/mean_length": 436.5, "completions/mean_terminated_length": 436.5, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7948615014050582, "frac_reward_zero_std": 0.5, "grad_norm": 0.762471217045676, "kl": 0.001735687255859375, "learning_rate": 2.1092825489506672e-07, "loss": 0.004, "num_tokens": 10537650.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 483.125, "completions/mean_terminated_length": 405.8571472167969, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.7951291315402115, "frac_reward_zero_std": 1.0, "grad_norm": 0.02323158680939592, "kl": 0.001575469970703125, "learning_rate": 2.1065202560039674e-07, "loss": 0.0001, "num_tokens": 10542859.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 208.25, "completions/mean_terminated_length": 208.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.7953967616753647, "frac_reward_zero_std": 0.5, "grad_norm": 1.0060159430044777, "kl": 0.00144195556640625, "learning_rate": 2.1037609244175028e-07, "loss": 0.0799, "num_tokens": 10545673.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 395.625, "completions/mean_terminated_length": 395.625, "completions/min_length": 326.0, "completions/min_terminated_length": 326.0, "epoch": 0.7956643918105178, "frac_reward_zero_std": 0.5, "grad_norm": 0.6946428047458462, "kl": 0.0010929107666015625, "learning_rate": 2.1010045565992363e-07, "loss": -0.004, "num_tokens": 10549962.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 331.625, "completions/mean_terminated_length": 331.625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.795932021945671, "frac_reward_zero_std": 0.5, "grad_norm": 1.20143399675072, "kl": 0.00292205810546875, "learning_rate": 2.098251154954544e-07, "loss": 0.0068, "num_tokens": 10553683.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 332.875, "completions/mean_terminated_length": 332.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.7961996520808243, "frac_reward_zero_std": 1.0, "grad_norm": 0.010691724189577284, "kl": 0.00127410888671875, "learning_rate": 2.0955007218862163e-07, "loss": 0.0001, "num_tokens": 10557258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 371.5, "completions/mean_terminated_length": 371.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.7964672822159775, "frac_reward_zero_std": 1.0, "grad_norm": 0.012238091041190955, "kl": 0.00119781494140625, "learning_rate": 2.0927532597944494e-07, "loss": 0.0, "num_tokens": 10561258.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 381.75, "completions/mean_terminated_length": 381.75, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.7967349123511307, "frac_reward_zero_std": 1.0, "grad_norm": 0.02257599156470515, "kl": 0.001651763916015625, "learning_rate": 2.09000877107685e-07, "loss": 0.0001, "num_tokens": 10565604.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 500.25, "completions/mean_terminated_length": 500.25, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 0.797002542486284, "frac_reward_zero_std": 1.0, "grad_norm": 0.040879638984973425, "kl": 0.002017974853515625, "learning_rate": 2.0872672581284272e-07, "loss": 0.0001, "num_tokens": 10570810.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 262.125, "completions/mean_terminated_length": 262.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.7972701726214372, "frac_reward_zero_std": 1.0, "grad_norm": 0.02466995555132043, "kl": 0.00208282470703125, "learning_rate": 2.084528723341596e-07, "loss": 0.0001, "num_tokens": 10573955.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 267.125, "completions/mean_terminated_length": 267.125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7975378027565904, "frac_reward_zero_std": 1.0, "grad_norm": 0.01987632653522615, "kl": 0.00168609619140625, "learning_rate": 2.0817931691061707e-07, "loss": 0.0001, "num_tokens": 10577052.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.7978054328917437, "frac_reward_zero_std": 1.0, "grad_norm": 0.058965862343878676, "kl": 0.0037689208984375, "learning_rate": 2.079060597809365e-07, "loss": 0.0002, "num_tokens": 10580323.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.7980730630268968, "frac_reward_zero_std": 0.5, "grad_norm": 0.7218548734929879, "kl": 0.001689910888671875, "learning_rate": 2.0763310118357892e-07, "loss": 0.0493, "num_tokens": 10584098.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 236.25, "completions/mean_terminated_length": 236.25, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.79834069316205, "frac_reward_zero_std": 1.0, "grad_norm": 0.02809665932929524, "kl": 0.00148773193359375, "learning_rate": 2.073604413567449e-07, "loss": 0.0001, "num_tokens": 10586980.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 329.375, "completions/mean_terminated_length": 329.375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.7986083232972032, "frac_reward_zero_std": 0.5, "grad_norm": 0.9123366322821888, "kl": 0.00139617919921875, "learning_rate": 2.0708808053837428e-07, "loss": 0.0652, "num_tokens": 10590619.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.7988759534323565, "frac_reward_zero_std": 1.0, "grad_norm": 0.024228813116244936, "kl": 0.001953125, "learning_rate": 2.0681601896614598e-07, "loss": 0.0001, "num_tokens": 10593775.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.7991435835675097, "frac_reward_zero_std": 1.0, "grad_norm": 0.026250626489271064, "kl": 0.002269744873046875, "learning_rate": 2.0654425687747767e-07, "loss": 0.0001, "num_tokens": 10597439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 277.625, "completions/mean_terminated_length": 277.625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.7994112137026629, "frac_reward_zero_std": 1.0, "grad_norm": 0.043650818960176266, "kl": 0.00240325927734375, "learning_rate": 2.0627279450952584e-07, "loss": 0.0001, "num_tokens": 10600700.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 403.625, "completions/mean_terminated_length": 403.625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.7996788438378162, "frac_reward_zero_std": 1.0, "grad_norm": 0.015215921992821112, "kl": 0.001316070556640625, "learning_rate": 2.060016320991853e-07, "loss": 0.0001, "num_tokens": 10605009.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 276.5, "completions/mean_terminated_length": 276.5, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.7999464739729694, "frac_reward_zero_std": 1.0, "grad_norm": 0.04281949825273419, "kl": 0.001789093017578125, "learning_rate": 2.0573076988308925e-07, "loss": 0.0001, "num_tokens": 10608233.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8002141041081225, "frac_reward_zero_std": 1.0, "grad_norm": 0.019290977217474997, "kl": 0.001308441162109375, "learning_rate": 2.054602080976084e-07, "loss": 0.0001, "num_tokens": 10611755.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 424.25, "completions/mean_terminated_length": 424.25, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8004817342432758, "frac_reward_zero_std": 0.5, "grad_norm": 1.0327863096837, "kl": 0.0023193359375, "learning_rate": 2.051899469788522e-07, "loss": -0.0018, "num_tokens": 10616493.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.800749364378429, "frac_reward_zero_std": 1.0, "grad_norm": 0.019220779475623156, "kl": 0.00159454345703125, "learning_rate": 2.0491998676266676e-07, "loss": 0.0001, "num_tokens": 10620039.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 269.625, "completions/mean_terminated_length": 269.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8010169945135822, "frac_reward_zero_std": 1.0, "grad_norm": 0.015547564892390972, "kl": 0.00107574462890625, "learning_rate": 2.0465032768463642e-07, "loss": 0.0, "num_tokens": 10623428.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 284.375, "completions/mean_terminated_length": 284.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8012846246487354, "frac_reward_zero_std": 1.0, "grad_norm": 0.018326251775136315, "kl": 0.001117706298828125, "learning_rate": 2.0438096998008237e-07, "loss": 0.0, "num_tokens": 10626739.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 206.75, "completions/mean_terminated_length": 206.75, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8015522547838887, "frac_reward_zero_std": 1.0, "grad_norm": 0.029688858707252283, "kl": 0.001354217529296875, "learning_rate": 2.041119138840623e-07, "loss": 0.0001, "num_tokens": 10629381.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 410.125, "completions/mean_terminated_length": 410.125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.8018198849190419, "frac_reward_zero_std": 0.0, "grad_norm": 1.3920999512655428, "kl": 0.0021514892578125, "learning_rate": 2.038431596313719e-07, "loss": -0.0216, "num_tokens": 10633854.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 260.375, "completions/mean_terminated_length": 260.375, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8020875150541951, "frac_reward_zero_std": 1.0, "grad_norm": 0.01814499872239937, "kl": 0.0010051727294921875, "learning_rate": 2.0357470745654212e-07, "loss": 0.0, "num_tokens": 10636869.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 241.25, "completions/mean_terminated_length": 241.25, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8023551451893484, "frac_reward_zero_std": 1.0, "grad_norm": 0.028869270524470622, "kl": 0.00146484375, "learning_rate": 2.0330655759384147e-07, "loss": 0.0001, "num_tokens": 10639739.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 315.375, "completions/mean_terminated_length": 315.375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8026227753245015, "frac_reward_zero_std": 1.0, "grad_norm": 0.03172616604160161, "kl": 0.0023345947265625, "learning_rate": 2.0303871027727364e-07, "loss": 0.0001, "num_tokens": 10643498.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 2999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 520.0, "completions/max_terminated_length": 520.0, "completions/mean_length": 295.5, "completions/mean_terminated_length": 295.5, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8028904054596547, "frac_reward_zero_std": 1.0, "grad_norm": 0.06679306687524166, "kl": 0.0039215087890625, "learning_rate": 2.02771165740579e-07, "loss": 0.0002, "num_tokens": 10646750.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 456.375, "completions/mean_terminated_length": 456.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.803158035594808, "frac_reward_zero_std": 1.0, "grad_norm": 0.027227930035556333, "kl": 0.0020313262939453125, "learning_rate": 2.025039242172334e-07, "loss": 0.0001, "num_tokens": 10651605.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8034256657299612, "frac_reward_zero_std": 1.0, "grad_norm": 0.02841218523362285, "kl": 0.001953125, "learning_rate": 2.022369859404483e-07, "loss": 0.0001, "num_tokens": 10654310.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 485.25, "completions/mean_terminated_length": 485.25, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.8036932958651144, "frac_reward_zero_std": 1.0, "grad_norm": 0.014552750769772268, "kl": 0.001495361328125, "learning_rate": 2.0197035114317057e-07, "loss": 0.0001, "num_tokens": 10659716.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 180.25, "completions/mean_terminated_length": 180.25, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.8039609260002676, "frac_reward_zero_std": 0.5, "grad_norm": 0.939351681782119, "kl": 0.001125335693359375, "learning_rate": 2.017040200580822e-07, "loss": -0.0199, "num_tokens": 10662134.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 350.5, "completions/mean_terminated_length": 350.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8042285561354209, "frac_reward_zero_std": 0.0, "grad_norm": 1.2842025979739962, "kl": 0.002105712890625, "learning_rate": 2.0143799291760017e-07, "loss": 0.0843, "num_tokens": 10665966.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 446.5, "completions/mean_terminated_length": 446.5, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8044961862705741, "frac_reward_zero_std": 1.0, "grad_norm": 0.02798749484252194, "kl": 0.002162933349609375, "learning_rate": 2.0117226995387625e-07, "loss": 0.0001, "num_tokens": 10670594.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8047638164057273, "frac_reward_zero_std": 1.0, "grad_norm": 0.025093002068885484, "kl": 0.002353668212890625, "learning_rate": 2.0090685139879664e-07, "loss": 0.0001, "num_tokens": 10673623.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 474.0, "completions/max_terminated_length": 474.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8050314465408805, "frac_reward_zero_std": 1.0, "grad_norm": 0.047873865139850064, "kl": 0.00347900390625, "learning_rate": 2.0064173748398203e-07, "loss": 0.0001, "num_tokens": 10677256.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 280.5, "completions/mean_terminated_length": 280.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8052990766760337, "frac_reward_zero_std": 1.0, "grad_norm": 0.017931836050416573, "kl": 0.001739501953125, "learning_rate": 2.0037692844078724e-07, "loss": 0.0001, "num_tokens": 10680616.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 592.0, "completions/max_terminated_length": 592.0, "completions/mean_length": 368.25, "completions/mean_terminated_length": 368.25, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.8055667068111869, "frac_reward_zero_std": 1.0, "grad_norm": 0.019493812527018067, "kl": 0.002349853515625, "learning_rate": 2.0011242450030098e-07, "loss": 0.0001, "num_tokens": 10684662.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8058343369463402, "frac_reward_zero_std": 1.0, "grad_norm": 0.008886658941070522, "kl": 0.000865936279296875, "learning_rate": 1.9984822589334577e-07, "loss": 0.0, "num_tokens": 10688192.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8061019670814934, "frac_reward_zero_std": 1.0, "grad_norm": 0.019756189791605115, "kl": 0.001422882080078125, "learning_rate": 1.9958433285047765e-07, "loss": 0.0001, "num_tokens": 10691603.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 255.625, "completions/mean_terminated_length": 255.625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8063695972166466, "frac_reward_zero_std": 1.0, "grad_norm": 0.019384517352580215, "kl": 0.0012645721435546875, "learning_rate": 1.993207456019859e-07, "loss": 0.0001, "num_tokens": 10694868.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 339.5, "completions/mean_terminated_length": 339.5, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.8066372273517998, "frac_reward_zero_std": 1.0, "grad_norm": 0.01640899893889185, "kl": 0.001544952392578125, "learning_rate": 1.990574643778932e-07, "loss": 0.0001, "num_tokens": 10698664.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 216.25, "completions/mean_terminated_length": 216.25, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8069048574869531, "frac_reward_zero_std": 1.0, "grad_norm": 0.02350673801343886, "kl": 0.00170135498046875, "learning_rate": 1.9879448940795497e-07, "loss": 0.0001, "num_tokens": 10701478.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 400.0, "completions/mean_terminated_length": 400.0, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.8071724876221062, "frac_reward_zero_std": 1.0, "grad_norm": 0.013484709549040826, "kl": 0.00164031982421875, "learning_rate": 1.9853182092165938e-07, "loss": 0.0001, "num_tokens": 10705654.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 413.875, "completions/mean_terminated_length": 413.875, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.8074401177572594, "frac_reward_zero_std": 0.5, "grad_norm": 0.6151580013248591, "kl": 0.001499176025390625, "learning_rate": 1.9826945914822723e-07, "loss": 0.0156, "num_tokens": 10710033.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 345.625, "completions/mean_terminated_length": 345.625, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8077077478924127, "frac_reward_zero_std": 1.0, "grad_norm": 0.018211896404077686, "kl": 0.0010356903076171875, "learning_rate": 1.9800740431661177e-07, "loss": 0.0, "num_tokens": 10713894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8079753780275659, "frac_reward_zero_std": 0.5, "grad_norm": 0.9661931202234519, "kl": 0.0026702880859375, "learning_rate": 1.9774565665549783e-07, "loss": 0.0527, "num_tokens": 10717698.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8082430081627191, "frac_reward_zero_std": 1.0, "grad_norm": 0.013736741341470867, "kl": 0.0009517669677734375, "learning_rate": 1.974842163933031e-07, "loss": 0.0, "num_tokens": 10720860.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8085106382978723, "frac_reward_zero_std": 1.0, "grad_norm": 0.0382238396392259, "kl": 0.0015630722045898438, "learning_rate": 1.9722308375817615e-07, "loss": 0.0001, "num_tokens": 10724025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 232.625, "completions/mean_terminated_length": 232.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8087782684330256, "frac_reward_zero_std": 1.0, "grad_norm": 0.10686271677163872, "kl": 0.00322723388671875, "learning_rate": 1.969622589779978e-07, "loss": 0.0001, "num_tokens": 10726814.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 317.875, "completions/mean_terminated_length": 317.875, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8090458985681788, "frac_reward_zero_std": 1.0, "grad_norm": 0.01483260761933772, "kl": 0.000965118408203125, "learning_rate": 1.9670174228037947e-07, "loss": 0.0, "num_tokens": 10730605.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 334.25, "completions/mean_terminated_length": 334.25, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.809313528703332, "frac_reward_zero_std": 1.0, "grad_norm": 0.021105458740286624, "kl": 0.00179290771484375, "learning_rate": 1.9644153389266426e-07, "loss": 0.0001, "num_tokens": 10734243.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8095811588384852, "frac_reward_zero_std": 1.0, "grad_norm": 0.014134289006535006, "kl": 0.0015106201171875, "learning_rate": 1.9618163404192651e-07, "loss": 0.0001, "num_tokens": 10737631.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8098487889736384, "frac_reward_zero_std": 1.0, "grad_norm": 0.011915398214594934, "kl": 0.0011444091796875, "learning_rate": 1.9592204295497027e-07, "loss": 0.0, "num_tokens": 10740931.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 509.0, "completions/max_terminated_length": 509.0, "completions/mean_length": 323.375, "completions/mean_terminated_length": 323.375, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8101164191087916, "frac_reward_zero_std": 1.0, "grad_norm": 0.010932687384699075, "kl": 0.0011138916015625, "learning_rate": 1.9566276085833133e-07, "loss": 0.0, "num_tokens": 10744626.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8103840492439449, "frac_reward_zero_std": 1.0, "grad_norm": 0.015792256923891117, "kl": 0.0010738372802734375, "learning_rate": 1.9540378797827495e-07, "loss": 0.0, "num_tokens": 10747600.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.8106516793790981, "frac_reward_zero_std": 1.0, "grad_norm": 0.012178482467425472, "kl": 0.00133514404296875, "learning_rate": 1.951451245407969e-07, "loss": 0.0001, "num_tokens": 10752021.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 406.0, "completions/max_terminated_length": 406.0, "completions/mean_length": 300.125, "completions/mean_terminated_length": 300.125, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8109193095142513, "frac_reward_zero_std": 1.0, "grad_norm": 0.020981396137250342, "kl": 0.0020294189453125, "learning_rate": 1.9488677077162293e-07, "loss": 0.0001, "num_tokens": 10755474.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 300.375, "completions/mean_terminated_length": 300.375, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.8111869396494045, "frac_reward_zero_std": 1.0, "grad_norm": 0.02360547849605571, "kl": 0.002685546875, "learning_rate": 1.9462872689620856e-07, "loss": 0.0001, "num_tokens": 10758813.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 264.875, "completions/mean_terminated_length": 264.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8114545697845578, "frac_reward_zero_std": 1.0, "grad_norm": 0.0220829932598908, "kl": 0.00170135498046875, "learning_rate": 1.9437099313973877e-07, "loss": 0.0001, "num_tokens": 10761936.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 517.5, "completions/mean_terminated_length": 445.14288330078125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.811722199919711, "frac_reward_zero_std": 1.0, "grad_norm": 0.029025245546797983, "kl": 0.002147674560546875, "learning_rate": 1.94113569727128e-07, "loss": 0.0001, "num_tokens": 10767296.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 336.75, "completions/mean_terminated_length": 336.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8119898300548641, "frac_reward_zero_std": 0.5, "grad_norm": 0.5949849589853975, "kl": 0.001079559326171875, "learning_rate": 1.9385645688301983e-07, "loss": 0.0, "num_tokens": 10771130.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 311.0, "completions/mean_terminated_length": 311.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8122574601900174, "frac_reward_zero_std": 1.0, "grad_norm": 0.015136643463619428, "kl": 0.001049041748046875, "learning_rate": 1.935996548317868e-07, "loss": 0.0, "num_tokens": 10774870.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8125250903251706, "frac_reward_zero_std": 1.0, "grad_norm": 0.01885382045471375, "kl": 0.001583099365234375, "learning_rate": 1.9334316379753035e-07, "loss": 0.0001, "num_tokens": 10777509.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8127927204603238, "frac_reward_zero_std": 0.5, "grad_norm": 1.2558630999785612, "kl": 0.0013427734375, "learning_rate": 1.930869840040803e-07, "loss": 0.0435, "num_tokens": 10780758.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 239.0, "completions/max_terminated_length": 239.0, "completions/mean_length": 221.625, "completions/mean_terminated_length": 221.625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8130603505954771, "frac_reward_zero_std": 1.0, "grad_norm": 0.024555743869455894, "kl": 0.00225830078125, "learning_rate": 1.9283111567499503e-07, "loss": 0.0001, "num_tokens": 10783523.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 751.0, "completions/max_terminated_length": 751.0, "completions/mean_length": 465.375, "completions/mean_terminated_length": 465.375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8133279807306303, "frac_reward_zero_std": 0.0, "grad_norm": 1.2250880477498436, "kl": 0.00223541259765625, "learning_rate": 1.9257555903356099e-07, "loss": 0.0072, "num_tokens": 10788578.0, "reward": 0.25, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 389.0, "completions/mean_terminated_length": 389.0, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.8135956108657835, "frac_reward_zero_std": 1.0, "grad_norm": 0.013610733167158169, "kl": 0.001285552978515625, "learning_rate": 1.9232031430279288e-07, "loss": 0.0001, "num_tokens": 10792826.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 320.375, "completions/mean_terminated_length": 320.375, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8138632410009367, "frac_reward_zero_std": 1.0, "grad_norm": 0.018287973116752676, "kl": 0.0018768310546875, "learning_rate": 1.9206538170543289e-07, "loss": 0.0001, "num_tokens": 10796561.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 273.0, "completions/mean_terminated_length": 273.0, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.81413087113609, "frac_reward_zero_std": 1.0, "grad_norm": 0.01815062008289137, "kl": 0.002277374267578125, "learning_rate": 1.9181076146395097e-07, "loss": 0.0001, "num_tokens": 10799781.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 365.25, "completions/mean_terminated_length": 365.25, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8143985012712431, "frac_reward_zero_std": 1.0, "grad_norm": 0.027983358982324447, "kl": 0.00235748291015625, "learning_rate": 1.9155645380054452e-07, "loss": 0.0001, "num_tokens": 10803655.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 495.0, "completions/max_terminated_length": 495.0, "completions/mean_length": 345.375, "completions/mean_terminated_length": 345.375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8146661314063963, "frac_reward_zero_std": 1.0, "grad_norm": 0.015435123066600383, "kl": 0.001346588134765625, "learning_rate": 1.9130245893713814e-07, "loss": 0.0001, "num_tokens": 10807362.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 296.0, "completions/mean_terminated_length": 296.0, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.8149337615415496, "frac_reward_zero_std": 1.0, "grad_norm": 0.015452333994570638, "kl": 0.0010318756103515625, "learning_rate": 1.9104877709538344e-07, "loss": 0.0, "num_tokens": 10810754.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 344.125, "completions/mean_terminated_length": 344.125, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8152013916767028, "frac_reward_zero_std": 1.0, "grad_norm": 0.020546121560559853, "kl": 0.00252532958984375, "learning_rate": 1.9079540849665886e-07, "loss": 0.0001, "num_tokens": 10814591.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 433.0, "completions/max_terminated_length": 433.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.815469021811856, "frac_reward_zero_std": 1.0, "grad_norm": 0.021217874395036945, "kl": 0.00240325927734375, "learning_rate": 1.905423533620697e-07, "loss": 0.0001, "num_tokens": 10818199.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 462.125, "completions/mean_terminated_length": 462.125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8157366519470093, "frac_reward_zero_std": 1.0, "grad_norm": 0.010063741067891515, "kl": 0.00115203857421875, "learning_rate": 1.902896119124471e-07, "loss": 0.0, "num_tokens": 10823184.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 265.5, "completions/mean_terminated_length": 265.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8160042820821625, "frac_reward_zero_std": 1.0, "grad_norm": 0.03012065093711588, "kl": 0.002105712890625, "learning_rate": 1.9003718436834933e-07, "loss": 0.0001, "num_tokens": 10826288.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 299.5, "completions/mean_terminated_length": 299.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8162719122173157, "frac_reward_zero_std": 1.0, "grad_norm": 0.03545962548342627, "kl": 0.002231597900390625, "learning_rate": 1.8978507095005975e-07, "loss": 0.0001, "num_tokens": 10829768.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 419.0, "completions/max_terminated_length": 419.0, "completions/mean_length": 358.0, "completions/mean_terminated_length": 358.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.8165395423524688, "frac_reward_zero_std": 0.5, "grad_norm": 0.7466897129642781, "kl": 0.001010894775390625, "learning_rate": 1.8953327187758872e-07, "loss": 0.0038, "num_tokens": 10833836.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 476.625, "completions/mean_terminated_length": 476.625, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "epoch": 0.8168071724876221, "frac_reward_zero_std": 0.5, "grad_norm": 0.5695581904340807, "kl": 0.0016937255859375, "learning_rate": 1.892817873706713e-07, "loss": 0.0521, "num_tokens": 10838765.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 327.375, "completions/mean_terminated_length": 327.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8170748026227753, "frac_reward_zero_std": 0.5, "grad_norm": 0.7353178946573871, "kl": 0.001346588134765625, "learning_rate": 1.8903061764876832e-07, "loss": 0.0341, "num_tokens": 10842488.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 333.75, "completions/mean_terminated_length": 333.75, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8173424327579285, "frac_reward_zero_std": 0.5, "grad_norm": 1.022036323942629, "kl": 0.001270294189453125, "learning_rate": 1.8877976293106646e-07, "loss": 0.0001, "num_tokens": 10846206.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 350.875, "completions/mean_terminated_length": 350.875, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.8176100628930818, "frac_reward_zero_std": 1.0, "grad_norm": 0.02091611375954464, "kl": 0.001552581787109375, "learning_rate": 1.885292234364765e-07, "loss": 0.0001, "num_tokens": 10850081.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.817877693028235, "frac_reward_zero_std": 1.0, "grad_norm": 0.010833928936999096, "kl": 0.0007648468017578125, "learning_rate": 1.8827899938363524e-07, "loss": 0.0, "num_tokens": 10853586.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 294.5, "completions/mean_terminated_length": 294.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8181453231633882, "frac_reward_zero_std": 1.0, "grad_norm": 0.017227694716117747, "kl": 0.00139617919921875, "learning_rate": 1.8802909099090324e-07, "loss": 0.0001, "num_tokens": 10856974.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 316.5, "completions/mean_terminated_length": 316.5, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8184129532985415, "frac_reward_zero_std": 1.0, "grad_norm": 0.01783516370543102, "kl": 0.00147247314453125, "learning_rate": 1.8777949847636626e-07, "loss": 0.0001, "num_tokens": 10860698.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 388.0, "completions/mean_terminated_length": 297.14288330078125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.8186805834336947, "frac_reward_zero_std": 0.5, "grad_norm": 0.6585829150060056, "kl": 0.00225830078125, "learning_rate": 1.8753022205783406e-07, "loss": 0.1353, "num_tokens": 10865322.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 319.25, "completions/mean_terminated_length": 319.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8189482135688478, "frac_reward_zero_std": 0.5, "grad_norm": 0.6970222686930473, "kl": 0.001316070556640625, "learning_rate": 1.872812619528406e-07, "loss": 0.0148, "num_tokens": 10868928.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 461.0, "completions/max_terminated_length": 461.0, "completions/mean_length": 343.625, "completions/mean_terminated_length": 343.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.819215843704001, "frac_reward_zero_std": 1.0, "grad_norm": 0.015831812921767843, "kl": 0.00138092041015625, "learning_rate": 1.87032618378644e-07, "loss": 0.0001, "num_tokens": 10872881.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8194834738391543, "frac_reward_zero_std": 1.0, "grad_norm": 0.02639050687719949, "kl": 0.001667022705078125, "learning_rate": 1.8678429155222592e-07, "loss": 0.0001, "num_tokens": 10877097.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 322.125, "completions/mean_terminated_length": 322.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8197511039743075, "frac_reward_zero_std": 1.0, "grad_norm": 0.008500657054234022, "kl": 0.0005779266357421875, "learning_rate": 1.8653628169029173e-07, "loss": 0.0, "num_tokens": 10880602.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 284.125, "completions/mean_terminated_length": 284.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8200187341094607, "frac_reward_zero_std": 1.0, "grad_norm": 0.01386219916473452, "kl": 0.001483917236328125, "learning_rate": 1.862885890092701e-07, "loss": 0.0001, "num_tokens": 10883907.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 308.0, "completions/mean_terminated_length": 308.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.820286364244614, "frac_reward_zero_std": 0.5, "grad_norm": 0.7653669794875019, "kl": 0.002178192138671875, "learning_rate": 1.8604121372531312e-07, "loss": 0.0718, "num_tokens": 10887347.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 403.625, "completions/mean_terminated_length": 403.625, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.8205539943797672, "frac_reward_zero_std": 1.0, "grad_norm": 0.017960892639142616, "kl": 0.001857757568359375, "learning_rate": 1.8579415605429567e-07, "loss": 0.0001, "num_tokens": 10891704.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.8208216245149204, "frac_reward_zero_std": 1.0, "grad_norm": 0.014639932872012719, "kl": 0.000904083251953125, "learning_rate": 1.855474162118155e-07, "loss": 0.0, "num_tokens": 10895353.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8210892546500737, "frac_reward_zero_std": 1.0, "grad_norm": 0.017329652702927696, "kl": 0.0008678436279296875, "learning_rate": 1.8530099441319313e-07, "loss": 0.0, "num_tokens": 10898179.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 413.0, "completions/max_terminated_length": 413.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8213568847852268, "frac_reward_zero_std": 1.0, "grad_norm": 0.014584804107967357, "kl": 0.001232147216796875, "learning_rate": 1.8505489087347148e-07, "loss": 0.0, "num_tokens": 10901415.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.82162451492038, "frac_reward_zero_std": 1.0, "grad_norm": 0.023990326915814087, "kl": 0.00220489501953125, "learning_rate": 1.8480910580741566e-07, "loss": 0.0001, "num_tokens": 10904393.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 250.375, "completions/mean_terminated_length": 250.375, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8218921450555332, "frac_reward_zero_std": 1.0, "grad_norm": 0.08302870170764286, "kl": 0.0035247802734375, "learning_rate": 1.8456363942951304e-07, "loss": 0.0001, "num_tokens": 10907268.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8221597751906865, "frac_reward_zero_std": 1.0, "grad_norm": 0.027041253222507396, "kl": 0.0018463134765625, "learning_rate": 1.8431849195397237e-07, "loss": 0.0001, "num_tokens": 10910992.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 282.125, "completions/mean_terminated_length": 282.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8224274053258397, "frac_reward_zero_std": 1.0, "grad_norm": 0.010464248815345694, "kl": 0.0008258819580078125, "learning_rate": 1.8407366359472492e-07, "loss": 0.0, "num_tokens": 10914233.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 401.125, "completions/mean_terminated_length": 401.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8226950354609929, "frac_reward_zero_std": 0.5, "grad_norm": 0.9004696206013036, "kl": 0.00279998779296875, "learning_rate": 1.8382915456542292e-07, "loss": -0.0033, "num_tokens": 10918710.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8229626655961462, "frac_reward_zero_std": 1.0, "grad_norm": 0.02473578170140603, "kl": 0.00174713134765625, "learning_rate": 1.8358496507944004e-07, "loss": 0.0001, "num_tokens": 10921856.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 474.5, "completions/mean_terminated_length": 474.5, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8232302957312994, "frac_reward_zero_std": 0.5, "grad_norm": 0.6517321972444108, "kl": 0.001125335693359375, "learning_rate": 1.8334109534987122e-07, "loss": 0.0893, "num_tokens": 10926792.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8234979258664525, "frac_reward_zero_std": 1.0, "grad_norm": 0.014672893107794552, "kl": 0.001346588134765625, "learning_rate": 1.8309754558953197e-07, "loss": 0.0001, "num_tokens": 10930615.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8237655560016057, "frac_reward_zero_std": 1.0, "grad_norm": 0.01123442942884751, "kl": 0.0012149810791015625, "learning_rate": 1.8285431601095932e-07, "loss": 0.0, "num_tokens": 10934492.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 348.25, "completions/mean_terminated_length": 348.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.824033186136759, "frac_reward_zero_std": 1.0, "grad_norm": 0.01603480630873504, "kl": 0.001308441162109375, "learning_rate": 1.8261140682640996e-07, "loss": 0.0001, "num_tokens": 10938542.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 349.375, "completions/mean_terminated_length": 349.375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8243008162719122, "frac_reward_zero_std": 1.0, "grad_norm": 0.0282114410364506, "kl": 0.0025482177734375, "learning_rate": 1.8236881824786192e-07, "loss": 0.0001, "num_tokens": 10942433.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 220.0, "completions/mean_terminated_length": 220.0, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.8245684464070654, "frac_reward_zero_std": 1.0, "grad_norm": 0.023373783651452006, "kl": 0.002685546875, "learning_rate": 1.8212655048701264e-07, "loss": 0.0001, "num_tokens": 10945077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 220.25, "completions/mean_terminated_length": 220.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8248360765422187, "frac_reward_zero_std": 1.0, "grad_norm": 0.012936524231963616, "kl": 0.001110076904296875, "learning_rate": 1.818846037552799e-07, "loss": 0.0, "num_tokens": 10947883.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 297.0, "completions/max_terminated_length": 297.0, "completions/mean_length": 205.125, "completions/mean_terminated_length": 205.125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8251037066773719, "frac_reward_zero_std": 1.0, "grad_norm": 0.02283588347628914, "kl": 0.002353668212890625, "learning_rate": 1.8164297826380185e-07, "loss": 0.0001, "num_tokens": 10950484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 295.125, "completions/mean_terminated_length": 295.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8253713368125251, "frac_reward_zero_std": 1.0, "grad_norm": 0.011594565137187988, "kl": 0.000942230224609375, "learning_rate": 1.8140167422343533e-07, "loss": 0.0, "num_tokens": 10953921.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 202.875, "completions/mean_terminated_length": 202.875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.8256389669476784, "frac_reward_zero_std": 1.0, "grad_norm": 0.0424000944562205, "kl": 0.00254058837890625, "learning_rate": 1.8116069184475759e-07, "loss": 0.0001, "num_tokens": 10956488.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 355.125, "completions/mean_terminated_length": 355.125, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.8259065970828315, "frac_reward_zero_std": 0.5, "grad_norm": 0.9013118136196545, "kl": 0.00154876708984375, "learning_rate": 1.8092003133806434e-07, "loss": 0.0001, "num_tokens": 10960505.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 411.75, "completions/mean_terminated_length": 411.75, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.8261742272179847, "frac_reward_zero_std": 1.0, "grad_norm": 0.01184714278523583, "kl": 0.000621795654296875, "learning_rate": 1.806796929133711e-07, "loss": 0.0, "num_tokens": 10965035.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.8264418573531379, "frac_reward_zero_std": 1.0, "grad_norm": 0.020688567347464666, "kl": 0.00116729736328125, "learning_rate": 1.8043967678041188e-07, "loss": 0.0, "num_tokens": 10968249.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 344.875, "completions/mean_terminated_length": 344.875, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.8267094874882912, "frac_reward_zero_std": 0.5, "grad_norm": 1.026207533018394, "kl": 0.001556396484375, "learning_rate": 1.801999831486397e-07, "loss": 0.0578, "num_tokens": 10972056.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 277.75, "completions/mean_terminated_length": 277.75, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8269771176234444, "frac_reward_zero_std": 1.0, "grad_norm": 0.01477566736241627, "kl": 0.001415252685546875, "learning_rate": 1.7996061222722604e-07, "loss": 0.0001, "num_tokens": 10975234.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.8272447477585976, "frac_reward_zero_std": 1.0, "grad_norm": 0.014942247871281329, "kl": 0.00130462646484375, "learning_rate": 1.7972156422506063e-07, "loss": 0.0001, "num_tokens": 10978750.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 354.625, "completions/mean_terminated_length": 354.625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.8275123778937509, "frac_reward_zero_std": 0.5, "grad_norm": 1.053829260329586, "kl": 0.00141143798828125, "learning_rate": 1.7948283935075176e-07, "loss": 0.0041, "num_tokens": 10982683.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 574.0, "completions/max_terminated_length": 574.0, "completions/mean_length": 423.375, "completions/mean_terminated_length": 423.375, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.8277800080289041, "frac_reward_zero_std": 0.5, "grad_norm": 0.6997118361229864, "kl": 0.0011959075927734375, "learning_rate": 1.792444378126254e-07, "loss": -0.0228, "num_tokens": 10987106.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 329.0, "completions/mean_terminated_length": 329.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.8280476381640572, "frac_reward_zero_std": 1.0, "grad_norm": 0.010826541623185778, "kl": 0.001201629638671875, "learning_rate": 1.790063598187254e-07, "loss": 0.0, "num_tokens": 10990698.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 290.0, "completions/max_terminated_length": 290.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 242.0, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8283152682992105, "frac_reward_zero_std": 1.0, "grad_norm": 0.015521610784592178, "kl": 0.00150299072265625, "learning_rate": 1.7876860557681345e-07, "loss": 0.0001, "num_tokens": 10993674.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 540.0, "completions/max_terminated_length": 540.0, "completions/mean_length": 398.75, "completions/mean_terminated_length": 398.75, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "epoch": 0.8285828984343637, "frac_reward_zero_std": 0.0, "grad_norm": 1.0205334178266428, "kl": 0.00490570068359375, "learning_rate": 1.785311752943685e-07, "loss": 0.048, "num_tokens": 10998232.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 339.75, "completions/mean_terminated_length": 339.75, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8288505285695169, "frac_reward_zero_std": 1.0, "grad_norm": 0.013771942912172515, "kl": 0.001277923583984375, "learning_rate": 1.7829406917858698e-07, "loss": 0.0001, "num_tokens": 11002118.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 257.25, "completions/mean_terminated_length": 257.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8291181587046701, "frac_reward_zero_std": 1.0, "grad_norm": 0.03710070679232757, "kl": 0.0025787353515625, "learning_rate": 1.7805728743638232e-07, "loss": 0.0001, "num_tokens": 11005276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 267.0, "completions/max_terminated_length": 267.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.8293857888398234, "frac_reward_zero_std": 1.0, "grad_norm": 0.014581293882872463, "kl": 0.0010242462158203125, "learning_rate": 1.7782083027438495e-07, "loss": 0.0, "num_tokens": 11008317.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.8296534189749766, "frac_reward_zero_std": 1.0, "grad_norm": 0.022785323759682688, "kl": 0.0016326904296875, "learning_rate": 1.77584697898942e-07, "loss": 0.0001, "num_tokens": 11011305.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8299210491101298, "frac_reward_zero_std": 1.0, "grad_norm": 0.014652517812722355, "kl": 0.001537322998046875, "learning_rate": 1.7734889051611695e-07, "loss": 0.0001, "num_tokens": 11014172.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 234.75, "completions/mean_terminated_length": 234.75, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.8301886792452831, "frac_reward_zero_std": 1.0, "grad_norm": 0.009725603974695473, "kl": 0.00067138671875, "learning_rate": 1.7711340833169024e-07, "loss": 0.0, "num_tokens": 11017026.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 299.0, "completions/max_terminated_length": 299.0, "completions/mean_length": 210.5, "completions/mean_terminated_length": 210.5, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8304563093804362, "frac_reward_zero_std": 1.0, "grad_norm": 0.03755307601890661, "kl": 0.002532958984375, "learning_rate": 1.7687825155115797e-07, "loss": 0.0001, "num_tokens": 11019818.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 362.0, "completions/max_terminated_length": 362.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.8307239395155894, "frac_reward_zero_std": 1.0, "grad_norm": 0.014339192840592646, "kl": 0.00133514404296875, "learning_rate": 1.7664342037973262e-07, "loss": 0.0001, "num_tokens": 11023281.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 637.0, "completions/max_terminated_length": 637.0, "completions/mean_length": 412.125, "completions/mean_terminated_length": 412.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.8309915696507427, "frac_reward_zero_std": 0.5, "grad_norm": 0.6024990893226638, "kl": 0.00148773193359375, "learning_rate": 1.7640891502234238e-07, "loss": 0.0001, "num_tokens": 11027670.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8312591997858959, "frac_reward_zero_std": 1.0, "grad_norm": 0.02169943427964067, "kl": 0.001911163330078125, "learning_rate": 1.7617473568363094e-07, "loss": 0.0001, "num_tokens": 11030763.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 341.875, "completions/mean_terminated_length": 341.875, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.8315268299210491, "frac_reward_zero_std": 1.0, "grad_norm": 0.009099409221936852, "kl": 0.0005855560302734375, "learning_rate": 1.7594088256795798e-07, "loss": 0.0, "num_tokens": 11034706.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8317944600562023, "frac_reward_zero_std": 1.0, "grad_norm": 0.030271920721349058, "kl": 0.002124786376953125, "learning_rate": 1.7570735587939772e-07, "loss": 0.0001, "num_tokens": 11037535.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 210.0, "completions/max_terminated_length": 210.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8320620901913556, "frac_reward_zero_std": 1.0, "grad_norm": 0.024951980596662676, "kl": 0.001598358154296875, "learning_rate": 1.7547415582174055e-07, "loss": 0.0001, "num_tokens": 11039971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 420.0, "completions/mean_terminated_length": 420.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.8323297203265088, "frac_reward_zero_std": 1.0, "grad_norm": 0.016648930664787134, "kl": 0.001117706298828125, "learning_rate": 1.7524128259849087e-07, "loss": 0.0, "num_tokens": 11044571.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 420.0, "completions/max_terminated_length": 420.0, "completions/mean_length": 268.375, "completions/mean_terminated_length": 268.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.832597350461662, "frac_reward_zero_std": 1.0, "grad_norm": 0.021940445071611307, "kl": 0.00185394287109375, "learning_rate": 1.7500873641286823e-07, "loss": 0.0001, "num_tokens": 11047718.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 296.25, "completions/mean_terminated_length": 296.25, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.8328649805968152, "frac_reward_zero_std": 1.0, "grad_norm": 0.009866830882542019, "kl": 0.0006465911865234375, "learning_rate": 1.7477651746780716e-07, "loss": 0.0, "num_tokens": 11051224.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8331326107319684, "frac_reward_zero_std": 1.0, "grad_norm": 0.01231739433409359, "kl": 0.00107574462890625, "learning_rate": 1.7454462596595587e-07, "loss": 0.0, "num_tokens": 11054118.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8334002408671216, "frac_reward_zero_std": 1.0, "grad_norm": 0.031005589018409007, "kl": 0.0016937255859375, "learning_rate": 1.7431306210967758e-07, "loss": 0.0001, "num_tokens": 11056829.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 635.125, "completions/mean_terminated_length": 579.5714721679688, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 0.8336678710022749, "frac_reward_zero_std": 0.5, "grad_norm": 0.7047565701912836, "kl": 0.00146484375, "learning_rate": 1.7408182610104882e-07, "loss": 0.142, "num_tokens": 11063318.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8339355011374281, "frac_reward_zero_std": 1.0, "grad_norm": 0.01964556396084651, "kl": 0.00130462646484375, "learning_rate": 1.7385091814186093e-07, "loss": 0.0001, "num_tokens": 11066618.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 336.625, "completions/mean_terminated_length": 336.625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.8342031312725813, "frac_reward_zero_std": 1.0, "grad_norm": 0.013050165321398985, "kl": 0.000843048095703125, "learning_rate": 1.7362033843361805e-07, "loss": 0.0, "num_tokens": 11070335.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 319.625, "completions/mean_terminated_length": 319.625, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8344707614077345, "frac_reward_zero_std": 1.0, "grad_norm": 0.03957894901848633, "kl": 0.00201416015625, "learning_rate": 1.7339008717753844e-07, "loss": 0.0001, "num_tokens": 11074080.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 348.625, "completions/mean_terminated_length": 348.625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8347383915428878, "frac_reward_zero_std": 1.0, "grad_norm": 0.6131057906728575, "kl": 0.00371551513671875, "learning_rate": 1.7316016457455363e-07, "loss": 0.0001, "num_tokens": 11078049.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 351.375, "completions/mean_terminated_length": 351.375, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.8350060216780409, "frac_reward_zero_std": 1.0, "grad_norm": 0.014057610838372347, "kl": 0.00119781494140625, "learning_rate": 1.7293057082530822e-07, "loss": 0.0, "num_tokens": 11081764.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 200.75, "completions/mean_terminated_length": 200.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8352736518131941, "frac_reward_zero_std": 1.0, "grad_norm": 0.010930451034626795, "kl": 0.0005178451538085938, "learning_rate": 1.7270130613015993e-07, "loss": 0.0, "num_tokens": 11084198.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 285.0, "completions/mean_terminated_length": 285.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8355412819483474, "frac_reward_zero_std": 1.0, "grad_norm": 0.01669339241176147, "kl": 0.00156402587890625, "learning_rate": 1.7247237068917936e-07, "loss": 0.0001, "num_tokens": 11087614.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.8358089120835006, "frac_reward_zero_std": 1.0, "grad_norm": 0.011050976327956147, "kl": 0.0007076263427734375, "learning_rate": 1.7224376470214966e-07, "loss": 0.0, "num_tokens": 11091354.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.8360765422186538, "frac_reward_zero_std": 1.0, "grad_norm": 0.029891333664481125, "kl": 0.00146484375, "learning_rate": 1.7201548836856655e-07, "loss": 0.0001, "num_tokens": 11094917.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 271.25, "completions/mean_terminated_length": 271.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8363441723538071, "frac_reward_zero_std": 1.0, "grad_norm": 0.027162616685892493, "kl": 0.00211334228515625, "learning_rate": 1.717875418876381e-07, "loss": 0.0001, "num_tokens": 11098267.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8366118024889603, "frac_reward_zero_std": 1.0, "grad_norm": 0.012380051385614198, "kl": 0.0008411407470703125, "learning_rate": 1.7155992545828458e-07, "loss": 0.0, "num_tokens": 11101537.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 250.5, "completions/mean_terminated_length": 250.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.8368794326241135, "frac_reward_zero_std": 1.0, "grad_norm": 0.015806695350363355, "kl": 0.001522064208984375, "learning_rate": 1.71332639279138e-07, "loss": 0.0001, "num_tokens": 11104625.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 324.375, "completions/mean_terminated_length": 324.375, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.8371470627592666, "frac_reward_zero_std": 1.0, "grad_norm": 0.03363839422501254, "kl": 0.001789093017578125, "learning_rate": 1.7110568354854247e-07, "loss": 0.0001, "num_tokens": 11108308.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 359.75, "completions/mean_terminated_length": 359.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8374146928944199, "frac_reward_zero_std": 0.5, "grad_norm": 0.6218603510831873, "kl": 0.0015716552734375, "learning_rate": 1.708790584645536e-07, "loss": 0.0607, "num_tokens": 11112346.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 282.625, "completions/mean_terminated_length": 282.625, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8376823230295731, "frac_reward_zero_std": 1.0, "grad_norm": 0.028700665137378466, "kl": 0.001506805419921875, "learning_rate": 1.706527642249382e-07, "loss": 0.0001, "num_tokens": 11115959.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 385.0, "completions/mean_terminated_length": 385.0, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8379499531647263, "frac_reward_zero_std": 1.0, "grad_norm": 0.03390315476451109, "kl": 0.00196075439453125, "learning_rate": 1.7042680102717495e-07, "loss": 0.0001, "num_tokens": 11120059.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.8382175832998796, "frac_reward_zero_std": 0.5, "grad_norm": 0.9492775780451403, "kl": 0.00298309326171875, "learning_rate": 1.702011690684531e-07, "loss": 0.1249, "num_tokens": 11124031.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 211.375, "completions/mean_terminated_length": 211.375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8384852134350328, "frac_reward_zero_std": 1.0, "grad_norm": 0.018526678820332706, "kl": 0.0010890960693359375, "learning_rate": 1.6997586854567314e-07, "loss": 0.0, "num_tokens": 11126834.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 305.0, "completions/max_terminated_length": 305.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.838752843570186, "frac_reward_zero_std": 1.0, "grad_norm": 0.01996318417724413, "kl": 0.00229644775390625, "learning_rate": 1.6975089965544624e-07, "loss": 0.0001, "num_tokens": 11130144.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 269.0, "completions/mean_terminated_length": 269.0, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8390204737053393, "frac_reward_zero_std": 1.0, "grad_norm": 0.010504447311741922, "kl": 0.0007495880126953125, "learning_rate": 1.6952626259409404e-07, "loss": 0.0, "num_tokens": 11133252.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 272.75, "completions/mean_terminated_length": 272.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8392881038404925, "frac_reward_zero_std": 1.0, "grad_norm": 0.026698166404688303, "kl": 0.00157928466796875, "learning_rate": 1.693019575576489e-07, "loss": 0.0001, "num_tokens": 11136410.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 296.5, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8395557339756456, "frac_reward_zero_std": 1.0, "grad_norm": 0.030275975631353255, "kl": 0.001789093017578125, "learning_rate": 1.6907798474185302e-07, "loss": 0.0001, "num_tokens": 11139730.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 285.5, "completions/mean_terminated_length": 285.5, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8398233641107988, "frac_reward_zero_std": 1.0, "grad_norm": 0.011952047473959317, "kl": 0.00089263916015625, "learning_rate": 1.688543443421593e-07, "loss": 0.0, "num_tokens": 11142958.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 378.0, "completions/max_terminated_length": 378.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8400909942459521, "frac_reward_zero_std": 0.5, "grad_norm": 1.332649769853833, "kl": 0.002094268798828125, "learning_rate": 1.6863103655372973e-07, "loss": 0.0564, "num_tokens": 11146478.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8403586243811053, "frac_reward_zero_std": 1.0, "grad_norm": 0.014825580759675815, "kl": 0.001377105712890625, "learning_rate": 1.6840806157143687e-07, "loss": 0.0001, "num_tokens": 11150328.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 278.875, "completions/mean_terminated_length": 278.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.8406262545162585, "frac_reward_zero_std": 1.0, "grad_norm": 0.018179077716662165, "kl": 0.001983642578125, "learning_rate": 1.681854195898624e-07, "loss": 0.0001, "num_tokens": 11153487.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8408938846514118, "frac_reward_zero_std": 1.0, "grad_norm": 0.03130427897624839, "kl": 0.0023956298828125, "learning_rate": 1.6796311080329718e-07, "loss": 0.0001, "num_tokens": 11157471.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.841161514786565, "frac_reward_zero_std": 1.0, "grad_norm": 0.019805651567048426, "kl": 0.001506805419921875, "learning_rate": 1.677411354057421e-07, "loss": 0.0001, "num_tokens": 11161168.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 405.0, "completions/max_terminated_length": 405.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.8414291449217182, "frac_reward_zero_std": 1.0, "grad_norm": 0.03176179598946277, "kl": 0.00183868408203125, "learning_rate": 1.675194935909061e-07, "loss": 0.0001, "num_tokens": 11164237.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 341.375, "completions/mean_terminated_length": 341.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8416967750568713, "frac_reward_zero_std": 0.5, "grad_norm": 0.7615917325488188, "kl": 0.002532958984375, "learning_rate": 1.67298185552208e-07, "loss": 0.0632, "num_tokens": 11168040.0, "reward": 0.375, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 304.25, "completions/mean_terminated_length": 304.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8419644051920246, "frac_reward_zero_std": 1.0, "grad_norm": 0.01520101235006912, "kl": 0.00128936767578125, "learning_rate": 1.6707721148277453e-07, "loss": 0.0001, "num_tokens": 11171470.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 266.125, "completions/mean_terminated_length": 266.125, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.8422320353271778, "frac_reward_zero_std": 0.5, "grad_norm": 1.2712214079352264, "kl": 0.001163482666015625, "learning_rate": 1.6685657157544152e-07, "loss": -0.0203, "num_tokens": 11174547.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.842499665462331, "frac_reward_zero_std": 1.0, "grad_norm": 0.021451621788161564, "kl": 0.00151824951171875, "learning_rate": 1.666362660227529e-07, "loss": 0.0001, "num_tokens": 11177540.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 206.375, "completions/mean_terminated_length": 206.375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8427672955974843, "frac_reward_zero_std": 1.0, "grad_norm": 0.021742490289873496, "kl": 0.001789093017578125, "learning_rate": 1.6641629501696087e-07, "loss": 0.0001, "num_tokens": 11180155.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8430349257326375, "frac_reward_zero_std": 1.0, "grad_norm": 0.015013843538352951, "kl": 0.0011157989501953125, "learning_rate": 1.6619665875002588e-07, "loss": 0.0, "num_tokens": 11183856.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 749.0, "completions/max_terminated_length": 749.0, "completions/mean_length": 477.125, "completions/mean_terminated_length": 477.125, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "epoch": 0.8433025558677907, "frac_reward_zero_std": 1.0, "grad_norm": 0.013203027087737476, "kl": 0.001300811767578125, "learning_rate": 1.65977357413616e-07, "loss": 0.0001, "num_tokens": 11188833.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.843570186002944, "frac_reward_zero_std": 1.0, "grad_norm": 0.011536567070772845, "kl": 0.001102447509765625, "learning_rate": 1.657583911991072e-07, "loss": 0.0, "num_tokens": 11192327.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 567.0, "completions/max_terminated_length": 567.0, "completions/mean_length": 344.625, "completions/mean_terminated_length": 344.625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8438378161380972, "frac_reward_zero_std": 1.0, "grad_norm": 0.015543393954724882, "kl": 0.0010051727294921875, "learning_rate": 1.6553976029758289e-07, "loss": 0.0, "num_tokens": 11196164.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 379.5, "completions/mean_terminated_length": 379.5, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.8441054462732503, "frac_reward_zero_std": 1.0, "grad_norm": 0.01883146799712487, "kl": 0.0017108917236328125, "learning_rate": 1.6532146489983395e-07, "loss": 0.0001, "num_tokens": 11200176.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.8443730764084035, "frac_reward_zero_std": 1.0, "grad_norm": 0.011353825827520437, "kl": 0.0007476806640625, "learning_rate": 1.6510350519635846e-07, "loss": 0.0, "num_tokens": 11203116.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 316.0, "completions/max_terminated_length": 316.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8446407065435568, "frac_reward_zero_std": 1.0, "grad_norm": 0.016506441238155572, "kl": 0.001438140869140625, "learning_rate": 1.6488588137736141e-07, "loss": 0.0001, "num_tokens": 11205876.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 312.75, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.84490833667871, "frac_reward_zero_std": 1.0, "grad_norm": 0.014931159942932672, "kl": 0.001384735107421875, "learning_rate": 1.6466859363275493e-07, "loss": 0.0001, "num_tokens": 11209726.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8451759668138632, "frac_reward_zero_std": 1.0, "grad_norm": 0.018604445643421362, "kl": 0.00140380859375, "learning_rate": 1.6445164215215774e-07, "loss": 0.0001, "num_tokens": 11213057.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.8454435969490165, "frac_reward_zero_std": 1.0, "grad_norm": 0.014146399251883728, "kl": 0.001125335693359375, "learning_rate": 1.64235027124895e-07, "loss": 0.0, "num_tokens": 11216241.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 285.625, "completions/mean_terminated_length": 285.625, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.8457112270841697, "frac_reward_zero_std": 1.0, "grad_norm": 0.02049963198671304, "kl": 0.00157928466796875, "learning_rate": 1.640187487399985e-07, "loss": 0.0001, "num_tokens": 11219598.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 462.0, "completions/max_terminated_length": 462.0, "completions/mean_length": 337.875, "completions/mean_terminated_length": 337.875, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8459788572193229, "frac_reward_zero_std": 1.0, "grad_norm": 0.012740339027711252, "kl": 0.0009212493896484375, "learning_rate": 1.6380280718620605e-07, "loss": 0.0, "num_tokens": 11223249.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 251.875, "completions/mean_terminated_length": 251.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8462464873544762, "frac_reward_zero_std": 1.0, "grad_norm": 0.018964472271205325, "kl": 0.001735687255859375, "learning_rate": 1.6358720265196164e-07, "loss": 0.0001, "num_tokens": 11226276.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.8465141174896293, "frac_reward_zero_std": 0.5, "grad_norm": 0.9651806425145094, "kl": 0.0020751953125, "learning_rate": 1.6337193532541504e-07, "loss": -0.0025, "num_tokens": 11229588.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 401.625, "completions/mean_terminated_length": 401.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8467817476247825, "frac_reward_zero_std": 1.0, "grad_norm": 0.01503409453647602, "kl": 0.00144195556640625, "learning_rate": 1.63157005394422e-07, "loss": 0.0001, "num_tokens": 11234129.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 284.0, "completions/max_terminated_length": 284.0, "completions/mean_length": 213.0, "completions/mean_terminated_length": 213.0, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8470493777599357, "frac_reward_zero_std": 1.0, "grad_norm": 0.01944951798031985, "kl": 0.001247406005859375, "learning_rate": 1.629424130465436e-07, "loss": 0.0, "num_tokens": 11236705.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 401.0, "completions/mean_terminated_length": 401.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.847317007895089, "frac_reward_zero_std": 0.5, "grad_norm": 0.8096824796510612, "kl": 0.002033233642578125, "learning_rate": 1.627281584690462e-07, "loss": -0.0139, "num_tokens": 11241293.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 229.0, "completions/max_terminated_length": 229.0, "completions/mean_length": 204.875, "completions/mean_terminated_length": 204.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8475846380302422, "frac_reward_zero_std": 1.0, "grad_norm": 0.02781377438179788, "kl": 0.0012969970703125, "learning_rate": 1.6251424184890195e-07, "loss": 0.0001, "num_tokens": 11243944.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 232.75, "completions/mean_terminated_length": 232.75, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8478522681653954, "frac_reward_zero_std": 1.0, "grad_norm": 0.026740133852806432, "kl": 0.001758575439453125, "learning_rate": 1.6230066337278723e-07, "loss": 0.0001, "num_tokens": 11246838.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 316.0, "completions/mean_terminated_length": 316.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8481198983005487, "frac_reward_zero_std": 1.0, "grad_norm": 0.020154518630245637, "kl": 0.00128173828125, "learning_rate": 1.6208742322708436e-07, "loss": 0.0001, "num_tokens": 11250446.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 224.25, "completions/mean_terminated_length": 224.25, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8483875284357019, "frac_reward_zero_std": 1.0, "grad_norm": 0.017327489815303757, "kl": 0.001132965087890625, "learning_rate": 1.618745215978795e-07, "loss": 0.0, "num_tokens": 11253256.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 329.625, "completions/mean_terminated_length": 329.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.848655158570855, "frac_reward_zero_std": 1.0, "grad_norm": 0.023301533566471393, "kl": 0.0021514892578125, "learning_rate": 1.616619586709638e-07, "loss": 0.0001, "num_tokens": 11256909.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8489227887060083, "frac_reward_zero_std": 1.0, "grad_norm": 0.05798930790884639, "kl": 0.0023651123046875, "learning_rate": 1.6144973463183308e-07, "loss": 0.0001, "num_tokens": 11260098.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 413.5, "completions/mean_terminated_length": 413.5, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "epoch": 0.8491904188411615, "frac_reward_zero_std": 1.0, "grad_norm": 0.015178525784804086, "kl": 0.00150299072265625, "learning_rate": 1.6123784966568686e-07, "loss": 0.0001, "num_tokens": 11264462.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 228.125, "completions/mean_terminated_length": 228.125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.8494580489763147, "frac_reward_zero_std": 1.0, "grad_norm": 0.01179828080695206, "kl": 0.000926971435546875, "learning_rate": 1.6102630395742933e-07, "loss": 0.0, "num_tokens": 11267123.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8497256791114679, "frac_reward_zero_std": 1.0, "grad_norm": 0.01746925280794193, "kl": 0.000972747802734375, "learning_rate": 1.6081509769166808e-07, "loss": 0.0, "num_tokens": 11270123.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 380.375, "completions/mean_terminated_length": 380.375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8499933092466212, "frac_reward_zero_std": 1.0, "grad_norm": 0.008380206127936355, "kl": 0.000782012939453125, "learning_rate": 1.6060423105271495e-07, "loss": 0.0, "num_tokens": 11274158.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8502609393817744, "frac_reward_zero_std": 1.0, "grad_norm": 0.012561918380346425, "kl": 0.0010395050048828125, "learning_rate": 1.603937042245851e-07, "loss": 0.0, "num_tokens": 11277681.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 267.375, "completions/mean_terminated_length": 267.375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.8505285695169276, "frac_reward_zero_std": 1.0, "grad_norm": 0.019201410330631104, "kl": 0.0018310546875, "learning_rate": 1.6018351739099734e-07, "loss": 0.0001, "num_tokens": 11280928.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8507961996520809, "frac_reward_zero_std": 1.0, "grad_norm": 0.013510089953720881, "kl": 0.001422882080078125, "learning_rate": 1.5997367073537364e-07, "loss": 0.0001, "num_tokens": 11284971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 249.375, "completions/mean_terminated_length": 249.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.851063829787234, "frac_reward_zero_std": 1.0, "grad_norm": 0.021744035308421065, "kl": 0.0013561248779296875, "learning_rate": 1.5976416444083918e-07, "loss": 0.0001, "num_tokens": 11287982.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 601.0, "completions/max_terminated_length": 601.0, "completions/mean_length": 341.5, "completions/mean_terminated_length": 341.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8513314599223872, "frac_reward_zero_std": 0.5, "grad_norm": 0.6940408619618125, "kl": 0.0014801025390625, "learning_rate": 1.5955499869022213e-07, "loss": 0.004, "num_tokens": 11291774.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 331.0, "completions/mean_terminated_length": 331.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.8515990900575405, "frac_reward_zero_std": 1.0, "grad_norm": 0.010713303036964043, "kl": 0.001125335693359375, "learning_rate": 1.5934617366605337e-07, "loss": 0.0, "num_tokens": 11295410.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 377.0, "completions/max_terminated_length": 377.0, "completions/mean_length": 293.75, "completions/mean_terminated_length": 293.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.8518667201926937, "frac_reward_zero_std": 1.0, "grad_norm": 0.020039015552670455, "kl": 0.002300262451171875, "learning_rate": 1.5913768955056667e-07, "loss": 0.0001, "num_tokens": 11298816.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 248.0, "completions/max_terminated_length": 248.0, "completions/mean_length": 192.875, "completions/mean_terminated_length": 192.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8521343503278469, "frac_reward_zero_std": 1.0, "grad_norm": 0.017267131099223742, "kl": 0.00135040283203125, "learning_rate": 1.58929546525698e-07, "loss": 0.0001, "num_tokens": 11301335.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8524019804630001, "frac_reward_zero_std": 1.0, "grad_norm": 0.027673374657601897, "kl": 0.0013370513916015625, "learning_rate": 1.58721744773086e-07, "loss": 0.0001, "num_tokens": 11304596.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 341.0, "completions/max_terminated_length": 341.0, "completions/mean_length": 265.0, "completions/mean_terminated_length": 265.0, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8526696105981534, "frac_reward_zero_std": 1.0, "grad_norm": 0.015370411498651117, "kl": 0.0013885498046875, "learning_rate": 1.585142844740712e-07, "loss": 0.0001, "num_tokens": 11307628.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 505.0, "completions/max_terminated_length": 505.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8529372407333066, "frac_reward_zero_std": 0.5, "grad_norm": 0.6856766198063337, "kl": 0.00225830078125, "learning_rate": 1.583071658096963e-07, "loss": 0.0686, "num_tokens": 11311623.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 380.0, "completions/max_terminated_length": 380.0, "completions/mean_length": 256.25, "completions/mean_terminated_length": 256.25, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8532048708684598, "frac_reward_zero_std": 1.0, "grad_norm": 0.02626218843459148, "kl": 0.00152587890625, "learning_rate": 1.5810038896070588e-07, "loss": 0.0001, "num_tokens": 11314901.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 229.75, "completions/mean_terminated_length": 229.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.853472501003613, "frac_reward_zero_std": 1.0, "grad_norm": 0.027348639628284706, "kl": 0.00182342529296875, "learning_rate": 1.5789395410754625e-07, "loss": 0.0001, "num_tokens": 11317619.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.8537401311387662, "frac_reward_zero_std": 0.5, "grad_norm": 1.1278097990274245, "kl": 0.0020904541015625, "learning_rate": 1.576878614303651e-07, "loss": -0.0521, "num_tokens": 11322160.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 419.875, "completions/mean_terminated_length": 419.875, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8540077612739194, "frac_reward_zero_std": 1.0, "grad_norm": 0.015262249124277903, "kl": 0.00156402587890625, "learning_rate": 1.5748211110901178e-07, "loss": 0.0001, "num_tokens": 11326651.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 325.875, "completions/mean_terminated_length": 325.875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8542753914090727, "frac_reward_zero_std": 1.0, "grad_norm": 0.022125379612966803, "kl": 0.00225830078125, "learning_rate": 1.5727670332303663e-07, "loss": 0.0001, "num_tokens": 11330362.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 302.375, "completions/mean_terminated_length": 302.375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8545430215442259, "frac_reward_zero_std": 1.0, "grad_norm": 0.018118919630721618, "kl": 0.001415252685546875, "learning_rate": 1.570716382516913e-07, "loss": 0.0001, "num_tokens": 11333709.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8548106516793791, "frac_reward_zero_std": 1.0, "grad_norm": 0.013402253209164782, "kl": 0.001110076904296875, "learning_rate": 1.5686691607392823e-07, "loss": 0.0, "num_tokens": 11337734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 348.375, "completions/mean_terminated_length": 348.375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8550782818145323, "frac_reward_zero_std": 0.5, "grad_norm": 0.7975993865537471, "kl": 0.001377105712890625, "learning_rate": 1.566625369684004e-07, "loss": 0.0247, "num_tokens": 11341817.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 585.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 420.125, "completions/mean_terminated_length": 420.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.8553459119496856, "frac_reward_zero_std": 1.0, "grad_norm": 0.009476390534999957, "kl": 0.0008525848388671875, "learning_rate": 1.5645850111346205e-07, "loss": 0.0, "num_tokens": 11346514.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 251.0, "completions/mean_terminated_length": 251.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8556135420848388, "frac_reward_zero_std": 1.0, "grad_norm": 0.03579610093410627, "kl": 0.002655029296875, "learning_rate": 1.562548086871671e-07, "loss": 0.0001, "num_tokens": 11349630.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 330.5, "completions/mean_terminated_length": 330.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.8558811722199919, "frac_reward_zero_std": 1.0, "grad_norm": 0.019154274198136194, "kl": 0.0008831024169921875, "learning_rate": 1.5605145986727055e-07, "loss": 0.0, "num_tokens": 11353342.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 591.0, "completions/max_terminated_length": 591.0, "completions/mean_length": 448.0, "completions/mean_terminated_length": 448.0, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8561488023551452, "frac_reward_zero_std": 0.5, "grad_norm": 0.8497641343024486, "kl": 0.001163482666015625, "learning_rate": 1.5584845483122685e-07, "loss": 0.0838, "num_tokens": 11358018.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 431.75, "completions/mean_terminated_length": 431.75, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.8564164324902984, "frac_reward_zero_std": 1.0, "grad_norm": 0.017322678389727807, "kl": 0.001140594482421875, "learning_rate": 1.5564579375619073e-07, "loss": 0.0, "num_tokens": 11362708.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 313.0, "completions/mean_terminated_length": 313.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8566840626254516, "frac_reward_zero_std": 1.0, "grad_norm": 0.0134847218004372, "kl": 0.0013580322265625, "learning_rate": 1.554434768190171e-07, "loss": 0.0001, "num_tokens": 11366328.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 379.5, "completions/mean_terminated_length": 379.5, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.8569516927606049, "frac_reward_zero_std": 1.0, "grad_norm": 0.03109285046391732, "kl": 0.0020599365234375, "learning_rate": 1.5524150419625985e-07, "loss": 0.0001, "num_tokens": 11370292.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 423.5, "completions/mean_terminated_length": 423.5, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "epoch": 0.8572193228957581, "frac_reward_zero_std": 1.0, "grad_norm": 0.020195506491393976, "kl": 0.0016021728515625, "learning_rate": 1.5503987606417306e-07, "loss": 0.0001, "num_tokens": 11374824.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 276.125, "completions/mean_terminated_length": 276.125, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8574869530309113, "frac_reward_zero_std": 0.5, "grad_norm": 0.777808866723547, "kl": 0.002414703369140625, "learning_rate": 1.5483859259870968e-07, "loss": 0.0036, "num_tokens": 11378077.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 292.625, "completions/mean_terminated_length": 292.625, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8577545831660645, "frac_reward_zero_std": 1.0, "grad_norm": 0.017030795167506996, "kl": 0.00116729736328125, "learning_rate": 1.546376539755223e-07, "loss": 0.0, "num_tokens": 11381338.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 256.625, "completions/mean_terminated_length": 256.625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8580222133012178, "frac_reward_zero_std": 0.5, "grad_norm": 1.4521823028047534, "kl": 0.001922607421875, "learning_rate": 1.5443706036996223e-07, "loss": 0.0124, "num_tokens": 11384667.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8582898434363709, "frac_reward_zero_std": 1.0, "grad_norm": 0.013884538763303655, "kl": 0.001216888427734375, "learning_rate": 1.5423681195707995e-07, "loss": 0.0, "num_tokens": 11388366.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8585574735715241, "frac_reward_zero_std": 1.0, "grad_norm": 0.021138505398270585, "kl": 0.001346588134765625, "learning_rate": 1.540369089116246e-07, "loss": 0.0001, "num_tokens": 11391239.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 310.625, "completions/mean_terminated_length": 310.625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8588251037066774, "frac_reward_zero_std": 1.0, "grad_norm": 0.033954322658901935, "kl": 0.0022125244140625, "learning_rate": 1.5383735140804386e-07, "loss": 0.0001, "num_tokens": 11394924.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 424.875, "completions/mean_terminated_length": 424.875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.8590927338418306, "frac_reward_zero_std": 1.0, "grad_norm": 0.019000130230344487, "kl": 0.001995086669921875, "learning_rate": 1.5363813962048405e-07, "loss": 0.0001, "num_tokens": 11399643.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 290.125, "completions/mean_terminated_length": 290.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.8593603639769838, "frac_reward_zero_std": 1.0, "grad_norm": 0.04124164948568094, "kl": 0.0019378662109375, "learning_rate": 1.534392737227896e-07, "loss": 0.0001, "num_tokens": 11403260.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 339.0, "completions/mean_terminated_length": 339.0, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.859627994112137, "frac_reward_zero_std": 1.0, "grad_norm": 0.010770780945111821, "kl": 0.0007419586181640625, "learning_rate": 1.5324075388850325e-07, "loss": 0.0, "num_tokens": 11406896.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8598956242472903, "frac_reward_zero_std": 1.0, "grad_norm": 0.026805732523061728, "kl": 0.001819610595703125, "learning_rate": 1.5304258029086568e-07, "loss": 0.0001, "num_tokens": 11410412.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 496.0, "completions/max_terminated_length": 496.0, "completions/mean_length": 268.75, "completions/mean_terminated_length": 268.75, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.8601632543824435, "frac_reward_zero_std": 1.0, "grad_norm": 0.020796305037466635, "kl": 0.001110076904296875, "learning_rate": 1.528447531028154e-07, "loss": 0.0, "num_tokens": 11413710.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 318.5, "completions/mean_terminated_length": 318.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.8604308845175966, "frac_reward_zero_std": 1.0, "grad_norm": 0.008603641917602753, "kl": 0.000644683837890625, "learning_rate": 1.526472724969886e-07, "loss": 0.0, "num_tokens": 11417306.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8606985146527499, "frac_reward_zero_std": 1.0, "grad_norm": 0.014337149283225072, "kl": 0.00145721435546875, "learning_rate": 1.5245013864571915e-07, "loss": 0.0001, "num_tokens": 11420306.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 664.0, "completions/max_terminated_length": 664.0, "completions/mean_length": 409.375, "completions/mean_terminated_length": 409.375, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.8609661447879031, "frac_reward_zero_std": 1.0, "grad_norm": 0.014419691711768831, "kl": 0.001125335693359375, "learning_rate": 1.522533517210382e-07, "loss": 0.0, "num_tokens": 11424605.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 305.5, "completions/mean_terminated_length": 305.5, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.8612337749230563, "frac_reward_zero_std": 1.0, "grad_norm": 0.014914223308609621, "kl": 0.000896453857421875, "learning_rate": 1.520569118946743e-07, "loss": 0.0, "num_tokens": 11428129.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 229.375, "completions/mean_terminated_length": 229.375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8615014050582096, "frac_reward_zero_std": 1.0, "grad_norm": 0.021654294163288818, "kl": 0.001556396484375, "learning_rate": 1.5186081933805267e-07, "loss": 0.0001, "num_tokens": 11431176.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.8617690351933628, "frac_reward_zero_std": 1.0, "grad_norm": 0.0171835404343351, "kl": 0.00115203857421875, "learning_rate": 1.5166507422229615e-07, "loss": 0.0, "num_tokens": 11434756.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 262.875, "completions/mean_terminated_length": 262.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.862036665328516, "frac_reward_zero_std": 1.0, "grad_norm": 0.011532193520869975, "kl": 0.0006237030029296875, "learning_rate": 1.5146967671822386e-07, "loss": 0.0, "num_tokens": 11437707.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 386.125, "completions/mean_terminated_length": 386.125, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.8623042954636692, "frac_reward_zero_std": 1.0, "grad_norm": 0.014341642126750402, "kl": 0.00106048583984375, "learning_rate": 1.5127462699635176e-07, "loss": 0.0, "num_tokens": 11441856.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 280.875, "completions/mean_terminated_length": 280.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8625719255988225, "frac_reward_zero_std": 1.0, "grad_norm": 0.02019046100215169, "kl": 0.00189208984375, "learning_rate": 1.5107992522689238e-07, "loss": 0.0001, "num_tokens": 11445095.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.8628395557339756, "frac_reward_zero_std": 1.0, "grad_norm": 0.01909996448676913, "kl": 0.0013256072998046875, "learning_rate": 1.508855715797543e-07, "loss": 0.0001, "num_tokens": 11447940.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 411.125, "completions/mean_terminated_length": 411.125, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "epoch": 0.8631071858691288, "frac_reward_zero_std": 1.0, "grad_norm": 0.01114413675896335, "kl": 0.001155853271484375, "learning_rate": 1.5069156622454285e-07, "loss": 0.0, "num_tokens": 11452533.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 231.25, "completions/mean_terminated_length": 231.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8633748160042821, "frac_reward_zero_std": 1.0, "grad_norm": 0.015815218554357285, "kl": 0.0008258819580078125, "learning_rate": 1.5049790933055878e-07, "loss": 0.0, "num_tokens": 11455311.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 670.0, "completions/max_terminated_length": 670.0, "completions/mean_length": 334.125, "completions/mean_terminated_length": 334.125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.8636424461394353, "frac_reward_zero_std": 1.0, "grad_norm": 0.03538156796676065, "kl": 0.00128936767578125, "learning_rate": 1.5030460106679935e-07, "loss": 0.0001, "num_tokens": 11458940.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 257.375, "completions/mean_terminated_length": 257.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8639100762745885, "frac_reward_zero_std": 1.0, "grad_norm": 0.02101699008561679, "kl": 0.00208282470703125, "learning_rate": 1.501116416019571e-07, "loss": 0.0001, "num_tokens": 11462003.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 241.75, "completions/mean_terminated_length": 241.75, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.8641777064097418, "frac_reward_zero_std": 1.0, "grad_norm": 0.015616299673025769, "kl": 0.0011692047119140625, "learning_rate": 1.4991903110442033e-07, "loss": 0.0, "num_tokens": 11465181.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 270.5, "completions/mean_terminated_length": 270.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.864445336544895, "frac_reward_zero_std": 1.0, "grad_norm": 0.029062978209720862, "kl": 0.002166748046875, "learning_rate": 1.4972676974227325e-07, "loss": 0.0001, "num_tokens": 11468581.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 288.5, "completions/mean_terminated_length": 288.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8647129666800482, "frac_reward_zero_std": 1.0, "grad_norm": 0.022821015478212803, "kl": 0.0016937255859375, "learning_rate": 1.495348576832945e-07, "loss": 0.0001, "num_tokens": 11472017.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 580.5, "completions/mean_terminated_length": 517.1428833007812, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8649805968152013, "frac_reward_zero_std": 1.0, "grad_norm": 0.015250235685493033, "kl": 0.001373291015625, "learning_rate": 1.4934329509495888e-07, "loss": 0.0001, "num_tokens": 11477793.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 353.375, "completions/mean_terminated_length": 353.375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.8652482269503546, "frac_reward_zero_std": 0.0, "grad_norm": 1.5121950279398289, "kl": 0.0024871826171875, "learning_rate": 1.4915208214443547e-07, "loss": 0.0245, "num_tokens": 11481740.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.8655158570855078, "frac_reward_zero_std": 1.0, "grad_norm": 0.016213407606965158, "kl": 0.001361846923828125, "learning_rate": 1.4896121899858858e-07, "loss": 0.0001, "num_tokens": 11484865.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 242.875, "completions/mean_terminated_length": 242.875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.865783487220661, "frac_reward_zero_std": 1.0, "grad_norm": 0.018470479065097863, "kl": 0.001575469970703125, "learning_rate": 1.4877070582397713e-07, "loss": 0.0001, "num_tokens": 11487960.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.8660511173558143, "frac_reward_zero_std": 1.0, "grad_norm": 0.02504078234522752, "kl": 0.0025787353515625, "learning_rate": 1.4858054278685478e-07, "loss": 0.0001, "num_tokens": 11492319.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 369.5, "completions/mean_terminated_length": 369.5, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.8663187474909675, "frac_reward_zero_std": 1.0, "grad_norm": 0.02938577752928488, "kl": 0.00107574462890625, "learning_rate": 1.4839073005316955e-07, "loss": 0.0, "num_tokens": 11496315.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 346.875, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8665863776261207, "frac_reward_zero_std": 0.5, "grad_norm": 0.9152616910132271, "kl": 0.00157928466796875, "learning_rate": 1.4820126778856375e-07, "loss": -0.0068, "num_tokens": 11500122.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.866854007761274, "frac_reward_zero_std": 1.0, "grad_norm": 0.019924390697894066, "kl": 0.00139617919921875, "learning_rate": 1.4801215615837383e-07, "loss": 0.0001, "num_tokens": 11503269.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 321.875, "completions/mean_terminated_length": 321.875, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8671216378964272, "frac_reward_zero_std": 1.0, "grad_norm": 0.010765772459700135, "kl": 0.001003265380859375, "learning_rate": 1.4782339532763032e-07, "loss": 0.0, "num_tokens": 11507000.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8673892680315803, "frac_reward_zero_std": 1.0, "grad_norm": 0.020147183559636548, "kl": 0.0011749267578125, "learning_rate": 1.4763498546105762e-07, "loss": 0.0, "num_tokens": 11510206.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 306.0, "completions/mean_terminated_length": 306.0, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8676568981667335, "frac_reward_zero_std": 1.0, "grad_norm": 0.020062256077434462, "kl": 0.001987457275390625, "learning_rate": 1.4744692672307378e-07, "loss": 0.0001, "num_tokens": 11513930.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.8679245283018868, "frac_reward_zero_std": 1.0, "grad_norm": 0.018728181740740226, "kl": 0.001239776611328125, "learning_rate": 1.4725921927779053e-07, "loss": 0.0, "num_tokens": 11516970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 354.875, "completions/mean_terminated_length": 354.875, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.86819215843704, "frac_reward_zero_std": 0.5, "grad_norm": 0.5479738812971727, "kl": 0.0012054443359375, "learning_rate": 1.4707186328901295e-07, "loss": 0.0, "num_tokens": 11520949.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 769.0, "completions/max_terminated_length": 769.0, "completions/mean_length": 507.875, "completions/mean_terminated_length": 507.875, "completions/min_length": 364.0, "completions/min_terminated_length": 364.0, "epoch": 0.8684597885721932, "frac_reward_zero_std": 1.0, "grad_norm": 0.010146592410904647, "kl": 0.0008716583251953125, "learning_rate": 1.4688485892023946e-07, "loss": 0.0, "num_tokens": 11526460.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 249.625, "completions/mean_terminated_length": 249.625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8687274187073465, "frac_reward_zero_std": 1.0, "grad_norm": 0.012874264867100593, "kl": 0.0008087158203125, "learning_rate": 1.4669820633466167e-07, "loss": 0.0, "num_tokens": 11529449.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 353.25, "completions/mean_terminated_length": 353.25, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.8689950488424997, "frac_reward_zero_std": 1.0, "grad_norm": 0.0191938611979629, "kl": 0.001972198486328125, "learning_rate": 1.4651190569516424e-07, "loss": 0.0001, "num_tokens": 11533527.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.8692626789776529, "frac_reward_zero_std": 1.0, "grad_norm": 0.02334943591543574, "kl": 0.001903533935546875, "learning_rate": 1.4632595716432432e-07, "loss": 0.0001, "num_tokens": 11536685.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 329.25, "completions/mean_terminated_length": 329.25, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.8695303091128062, "frac_reward_zero_std": 1.0, "grad_norm": 0.009254648862361967, "kl": 0.0007724761962890625, "learning_rate": 1.4614036090441243e-07, "loss": 0.0, "num_tokens": 11540479.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 293.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 200.125, "completions/mean_terminated_length": 200.125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.8697979392479593, "frac_reward_zero_std": 1.0, "grad_norm": 0.015068655081893673, "kl": 0.000812530517578125, "learning_rate": 1.459551170773912e-07, "loss": 0.0, "num_tokens": 11542952.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 305.625, "completions/mean_terminated_length": 305.625, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.8700655693831125, "frac_reward_zero_std": 0.5, "grad_norm": 1.0839708830044423, "kl": 0.0024261474609375, "learning_rate": 1.4577022584491585e-07, "loss": 0.0034, "num_tokens": 11546553.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 308.75, "completions/mean_terminated_length": 308.75, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.8703331995182657, "frac_reward_zero_std": 1.0, "grad_norm": 0.018611878244658892, "kl": 0.001255035400390625, "learning_rate": 1.4558568736833404e-07, "loss": 0.0001, "num_tokens": 11549975.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 288.0, "completions/max_terminated_length": 288.0, "completions/mean_length": 245.625, "completions/mean_terminated_length": 245.625, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.870600829653419, "frac_reward_zero_std": 1.0, "grad_norm": 0.018848880925822088, "kl": 0.001331329345703125, "learning_rate": 1.4540150180868513e-07, "loss": 0.0001, "num_tokens": 11553020.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8708684597885722, "frac_reward_zero_std": 1.0, "grad_norm": 0.039326137197433254, "kl": 0.0028076171875, "learning_rate": 1.4521766932670123e-07, "loss": 0.0001, "num_tokens": 11556133.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 424.0, "completions/max_terminated_length": 424.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.8711360899237254, "frac_reward_zero_std": 1.0, "grad_norm": 0.020620369641937017, "kl": 0.00150299072265625, "learning_rate": 1.4503419008280552e-07, "loss": 0.0001, "num_tokens": 11559522.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 309.0, "completions/max_terminated_length": 309.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.8714037200588787, "frac_reward_zero_std": 1.0, "grad_norm": 0.018285370447354288, "kl": 0.00167083740234375, "learning_rate": 1.4485106423711374e-07, "loss": 0.0001, "num_tokens": 11562568.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 527.0, "completions/max_terminated_length": 527.0, "completions/mean_length": 375.5, "completions/mean_terminated_length": 375.5, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.8716713501940319, "frac_reward_zero_std": 0.0, "grad_norm": 1.0633879809730296, "kl": 0.0019683837890625, "learning_rate": 1.446682919494325e-07, "loss": 0.0555, "num_tokens": 11566836.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 300.0, "completions/mean_terminated_length": 300.0, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.871938980329185, "frac_reward_zero_std": 1.0, "grad_norm": 0.016573466168942175, "kl": 0.001728057861328125, "learning_rate": 1.4448587337926028e-07, "loss": 0.0001, "num_tokens": 11570232.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 506.0, "completions/max_terminated_length": 506.0, "completions/mean_length": 360.5, "completions/mean_terminated_length": 360.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.8722066104643383, "frac_reward_zero_std": 0.5, "grad_norm": 1.0520463374756448, "kl": 0.001422882080078125, "learning_rate": 1.44303808685787e-07, "loss": 0.0126, "num_tokens": 11574260.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 358.125, "completions/mean_terminated_length": 358.125, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.8724742405994915, "frac_reward_zero_std": 1.0, "grad_norm": 0.014979686109277628, "kl": 0.0014801025390625, "learning_rate": 1.4412209802789323e-07, "loss": 0.0001, "num_tokens": 11578457.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 407.0, "completions/max_terminated_length": 407.0, "completions/mean_length": 292.5, "completions/mean_terminated_length": 292.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8727418707346447, "frac_reward_zero_std": 1.0, "grad_norm": 0.019831477688991572, "kl": 0.001079559326171875, "learning_rate": 1.4394074156415128e-07, "loss": 0.0, "num_tokens": 11581869.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8730095008697979, "frac_reward_zero_std": 1.0, "grad_norm": 0.016681584684958586, "kl": 0.00157928466796875, "learning_rate": 1.4375973945282378e-07, "loss": 0.0001, "num_tokens": 11585444.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 537.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.8732771310049512, "frac_reward_zero_std": 0.5, "grad_norm": 1.0328999076252168, "kl": 0.0020599365234375, "learning_rate": 1.435790918518644e-07, "loss": 0.0001, "num_tokens": 11589814.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 321.5, "completions/mean_terminated_length": 321.5, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.8735447611401044, "frac_reward_zero_std": 1.0, "grad_norm": 0.014461299877851958, "kl": 0.00138092041015625, "learning_rate": 1.4339879891891746e-07, "loss": 0.0001, "num_tokens": 11593490.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.8738123912752576, "frac_reward_zero_std": 1.0, "grad_norm": 0.01905916038426837, "kl": 0.00168609619140625, "learning_rate": 1.4321886081131767e-07, "loss": 0.0001, "num_tokens": 11597122.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 363.0, "completions/mean_terminated_length": 363.0, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8740800214104109, "frac_reward_zero_std": 1.0, "grad_norm": 0.012420990933481754, "kl": 0.00116729736328125, "learning_rate": 1.4303927768609014e-07, "loss": 0.0, "num_tokens": 11601146.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 349.625, "completions/mean_terminated_length": 349.625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.874347651545564, "frac_reward_zero_std": 1.0, "grad_norm": 0.014380545272598674, "kl": 0.001026153564453125, "learning_rate": 1.4286004969995027e-07, "loss": 0.0, "num_tokens": 11605211.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 296.5, "completions/mean_terminated_length": 296.5, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8746152816807172, "frac_reward_zero_std": 0.5, "grad_norm": 0.7742465589267088, "kl": 0.00244903564453125, "learning_rate": 1.4268117700930345e-07, "loss": 0.0235, "num_tokens": 11608631.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 556.0, "completions/max_terminated_length": 556.0, "completions/mean_length": 302.0, "completions/mean_terminated_length": 302.0, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.8748829118158704, "frac_reward_zero_std": 1.0, "grad_norm": 0.025967296155104333, "kl": 0.00171661376953125, "learning_rate": 1.4250265977024507e-07, "loss": 0.0001, "num_tokens": 11612259.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 392.0, "completions/max_terminated_length": 392.0, "completions/mean_length": 312.375, "completions/mean_terminated_length": 312.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8751505419510237, "frac_reward_zero_std": 1.0, "grad_norm": 0.024579281574938668, "kl": 0.00177764892578125, "learning_rate": 1.4232449813856022e-07, "loss": 0.0001, "num_tokens": 11615766.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 295.0, "completions/mean_terminated_length": 295.0, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.8754181720861769, "frac_reward_zero_std": 1.0, "grad_norm": 0.024549968272007294, "kl": 0.0012187957763671875, "learning_rate": 1.4214669226972385e-07, "loss": 0.0, "num_tokens": 11619074.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 226.5, "completions/mean_terminated_length": 226.5, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8756858022213301, "frac_reward_zero_std": 1.0, "grad_norm": 0.02027726816113705, "kl": 0.001575469970703125, "learning_rate": 1.419692423189004e-07, "loss": 0.0001, "num_tokens": 11621762.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 314.625, "completions/mean_terminated_length": 314.625, "completions/min_length": 285.0, "completions/min_terminated_length": 285.0, "epoch": 0.8759534323564834, "frac_reward_zero_std": 1.0, "grad_norm": 0.00941718104955578, "kl": 0.0005350112915039062, "learning_rate": 1.4179214844094355e-07, "loss": 0.0, "num_tokens": 11625223.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8762210624916366, "frac_reward_zero_std": 1.0, "grad_norm": 0.013229972901986145, "kl": 0.000896453857421875, "learning_rate": 1.416154107903964e-07, "loss": 0.0, "num_tokens": 11628426.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 371.25, "completions/mean_terminated_length": 371.25, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8764886926267897, "frac_reward_zero_std": 1.0, "grad_norm": 0.012562024790888878, "kl": 0.0012054443359375, "learning_rate": 1.4143902952149113e-07, "loss": 0.0, "num_tokens": 11632660.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 309.0, "completions/mean_terminated_length": 309.0, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.876756322761943, "frac_reward_zero_std": 0.5, "grad_norm": 0.8437590033473458, "kl": 0.00113677978515625, "learning_rate": 1.4126300478814912e-07, "loss": 0.0625, "num_tokens": 11636304.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 230.875, "completions/mean_terminated_length": 230.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8770239528970962, "frac_reward_zero_std": 1.0, "grad_norm": 0.016996199103267506, "kl": 0.0010223388671875, "learning_rate": 1.410873367439801e-07, "loss": 0.0, "num_tokens": 11639207.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 284.0, "completions/mean_terminated_length": 284.0, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.8772915830322494, "frac_reward_zero_std": 1.0, "grad_norm": 0.046710137781365256, "kl": 0.002246856689453125, "learning_rate": 1.4091202554228312e-07, "loss": 0.0001, "num_tokens": 11642531.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 442.125, "completions/mean_terminated_length": 442.125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "epoch": 0.8775592131674026, "frac_reward_zero_std": 0.5, "grad_norm": 0.6097549742868938, "kl": 0.001125335693359375, "learning_rate": 1.4073707133604552e-07, "loss": 0.0014, "num_tokens": 11647320.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 330.25, "completions/mean_terminated_length": 330.25, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.8778268433025559, "frac_reward_zero_std": 1.0, "grad_norm": 0.013124005959808562, "kl": 0.001071929931640625, "learning_rate": 1.4056247427794315e-07, "loss": 0.0, "num_tokens": 11651434.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 222.5, "completions/mean_terminated_length": 222.5, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8780944734377091, "frac_reward_zero_std": 1.0, "grad_norm": 0.024289522714681638, "kl": 0.001781463623046875, "learning_rate": 1.4038823452034028e-07, "loss": 0.0001, "num_tokens": 11654230.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 253.125, "completions/mean_terminated_length": 253.125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.8783621035728623, "frac_reward_zero_std": 1.0, "grad_norm": 0.040324263795010175, "kl": 0.002593994140625, "learning_rate": 1.4021435221528903e-07, "loss": 0.0001, "num_tokens": 11657515.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 443.75, "completions/mean_terminated_length": 443.75, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.8786297337080156, "frac_reward_zero_std": 0.5, "grad_norm": 0.7438350997485536, "kl": 0.001617431640625, "learning_rate": 1.4004082751453023e-07, "loss": 0.0167, "num_tokens": 11662141.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 489.0, "completions/max_terminated_length": 489.0, "completions/mean_length": 310.25, "completions/mean_terminated_length": 310.25, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8788973638431687, "frac_reward_zero_std": 0.5, "grad_norm": 0.9896842109050719, "kl": 0.001331329345703125, "learning_rate": 1.3986766056949188e-07, "loss": 0.037, "num_tokens": 11665623.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 307.625, "completions/mean_terminated_length": 307.625, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8791649939783219, "frac_reward_zero_std": 1.0, "grad_norm": 0.010128394025409868, "kl": 0.000629425048828125, "learning_rate": 1.3969485153129053e-07, "loss": 0.0, "num_tokens": 11669316.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.8794326241134752, "frac_reward_zero_std": 1.0, "grad_norm": 0.02009910829098036, "kl": 0.001552581787109375, "learning_rate": 1.3952240055072972e-07, "loss": 0.0001, "num_tokens": 11673219.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 253.0, "completions/max_terminated_length": 253.0, "completions/mean_length": 216.625, "completions/mean_terminated_length": 216.625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.8797002542486284, "frac_reward_zero_std": 1.0, "grad_norm": 0.01407595566422158, "kl": 0.000850677490234375, "learning_rate": 1.3935030777830095e-07, "loss": 0.0, "num_tokens": 11675916.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.8799678843837816, "frac_reward_zero_std": 1.0, "grad_norm": 0.020518837779481772, "kl": 0.001422882080078125, "learning_rate": 1.391785733641831e-07, "loss": 0.0001, "num_tokens": 11679887.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 227.25, "completions/mean_terminated_length": 227.25, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.8802355145189348, "frac_reward_zero_std": 1.0, "grad_norm": 0.01587290386121757, "kl": 0.0013275146484375, "learning_rate": 1.3900719745824204e-07, "loss": 0.0001, "num_tokens": 11682633.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 309.125, "completions/mean_terminated_length": 309.125, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.8805031446540881, "frac_reward_zero_std": 0.5, "grad_norm": 0.7515497957628121, "kl": 0.003570556640625, "learning_rate": 1.3883618021003133e-07, "loss": 0.0649, "num_tokens": 11686278.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 369.625, "completions/mean_terminated_length": 369.625, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.8807707747892413, "frac_reward_zero_std": 0.5, "grad_norm": 0.7101631562180005, "kl": 0.00274658203125, "learning_rate": 1.3866552176879073e-07, "loss": 0.0001, "num_tokens": 11690447.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 214.875, "completions/mean_terminated_length": 214.875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8810384049243944, "frac_reward_zero_std": 1.0, "grad_norm": 0.04582185046506439, "kl": 0.00249481201171875, "learning_rate": 1.3849522228344778e-07, "loss": 0.0001, "num_tokens": 11693226.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 277.375, "completions/mean_terminated_length": 277.375, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8813060350595477, "frac_reward_zero_std": 1.0, "grad_norm": 0.014116544942924265, "kl": 0.001789093017578125, "learning_rate": 1.3832528190261607e-07, "loss": 0.0001, "num_tokens": 11696477.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 268.5, "completions/mean_terminated_length": 268.5, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.8815736651947009, "frac_reward_zero_std": 1.0, "grad_norm": 0.02105901712782323, "kl": 0.0013179779052734375, "learning_rate": 1.3815570077459615e-07, "loss": 0.0001, "num_tokens": 11699637.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 277.125, "completions/mean_terminated_length": 277.125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.8818412953298541, "frac_reward_zero_std": 1.0, "grad_norm": 0.022182063072876994, "kl": 0.001583099365234375, "learning_rate": 1.3798647904737505e-07, "loss": 0.0001, "num_tokens": 11702894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 256.0, "completions/max_terminated_length": 256.0, "completions/mean_length": 207.0, "completions/mean_terminated_length": 207.0, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8821089254650074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0413361238835411, "kl": 0.001926422119140625, "learning_rate": 1.3781761686862602e-07, "loss": 0.0001, "num_tokens": 11705530.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 378.875, "completions/mean_terminated_length": 378.875, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8823765556001606, "frac_reward_zero_std": 1.0, "grad_norm": 0.009611900506594358, "kl": 0.00063323974609375, "learning_rate": 1.376491143857087e-07, "loss": 0.0, "num_tokens": 11709609.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 325.5, "completions/mean_terminated_length": 325.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8826441857353138, "frac_reward_zero_std": 0.5, "grad_norm": 1.002730072772405, "kl": 0.001209259033203125, "learning_rate": 1.3748097174566877e-07, "loss": 0.0012, "num_tokens": 11713261.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 202.25, "completions/mean_terminated_length": 202.25, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.882911815870467, "frac_reward_zero_std": 1.0, "grad_norm": 0.01697051041019356, "kl": 0.001453399658203125, "learning_rate": 1.3731318909523793e-07, "loss": 0.0001, "num_tokens": 11715811.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 321.625, "completions/mean_terminated_length": 321.625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.8831794460056203, "frac_reward_zero_std": 0.5, "grad_norm": 0.6723937232243807, "kl": 0.001087188720703125, "learning_rate": 1.3714576658083355e-07, "loss": 0.0428, "num_tokens": 11719540.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.8834470761407734, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075907729988561575, "kl": 0.0005435943603515625, "learning_rate": 1.3697870434855902e-07, "loss": 0.0, "num_tokens": 11723266.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 191.75, "completions/mean_terminated_length": 191.75, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8837147062759266, "frac_reward_zero_std": 1.0, "grad_norm": 0.019006986166483043, "kl": 0.0016021728515625, "learning_rate": 1.3681200254420316e-07, "loss": 0.0001, "num_tokens": 11725676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 250.0, "completions/max_terminated_length": 250.0, "completions/mean_length": 214.375, "completions/mean_terminated_length": 214.375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.8839823364110799, "frac_reward_zero_std": 1.0, "grad_norm": 0.011070546529394326, "kl": 0.00102996826171875, "learning_rate": 1.3664566131324017e-07, "loss": 0.0, "num_tokens": 11728335.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 357.0, "completions/mean_terminated_length": 357.0, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.8842499665462331, "frac_reward_zero_std": 1.0, "grad_norm": 0.021738009980176987, "kl": 0.00189971923828125, "learning_rate": 1.3647968080082978e-07, "loss": 0.0001, "num_tokens": 11732339.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 410.875, "completions/mean_terminated_length": 410.875, "completions/min_length": 328.0, "completions/min_terminated_length": 328.0, "epoch": 0.8845175966813863, "frac_reward_zero_std": 0.5, "grad_norm": 0.5605667212125439, "kl": 0.0013408660888671875, "learning_rate": 1.363140611518169e-07, "loss": -0.0173, "num_tokens": 11736574.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 984.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 390.875, "completions/mean_terminated_length": 390.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.8847852268165396, "frac_reward_zero_std": 1.0, "grad_norm": 0.026018278174394727, "kl": 0.001438140869140625, "learning_rate": 1.3614880251073128e-07, "loss": 0.0001, "num_tokens": 11740749.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 294.125, "completions/mean_terminated_length": 294.125, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.8850528569516928, "frac_reward_zero_std": 1.0, "grad_norm": 0.014986413306270601, "kl": 0.001407623291015625, "learning_rate": 1.3598390502178797e-07, "loss": 0.0001, "num_tokens": 11744314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 255.5, "completions/mean_terminated_length": 255.5, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.885320487086846, "frac_reward_zero_std": 1.0, "grad_norm": 0.032700510550119605, "kl": 0.00186920166015625, "learning_rate": 1.3581936882888666e-07, "loss": 0.0001, "num_tokens": 11747398.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 465.0, "completions/max_terminated_length": 465.0, "completions/mean_length": 352.0, "completions/mean_terminated_length": 352.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.8855881172219991, "frac_reward_zero_std": 1.0, "grad_norm": 0.012870990557732586, "kl": 0.001354217529296875, "learning_rate": 1.356551940756119e-07, "loss": 0.0001, "num_tokens": 11751442.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 315.5, "completions/mean_terminated_length": 315.5, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8858557473571524, "frac_reward_zero_std": 1.0, "grad_norm": 0.017368528871562483, "kl": 0.001049041748046875, "learning_rate": 1.3549138090523277e-07, "loss": 0.0, "num_tokens": 11754978.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 388.0, "completions/max_terminated_length": 388.0, "completions/mean_length": 289.125, "completions/mean_terminated_length": 289.125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.8861233774923056, "frac_reward_zero_std": 1.0, "grad_norm": 0.013495947072621792, "kl": 0.001132965087890625, "learning_rate": 1.3532792946070242e-07, "loss": 0.0, "num_tokens": 11758323.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.8863910076274588, "frac_reward_zero_std": 1.0, "grad_norm": 0.015094485009927359, "kl": 0.00128173828125, "learning_rate": 1.3516483988465913e-07, "loss": 0.0001, "num_tokens": 11762055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.8866586377626121, "frac_reward_zero_std": 1.0, "grad_norm": 0.01055198812851668, "kl": 0.0005855560302734375, "learning_rate": 1.350021123194246e-07, "loss": 0.0, "num_tokens": 11765453.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 449.875, "completions/mean_terminated_length": 449.875, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8869262678977653, "frac_reward_zero_std": 0.5, "grad_norm": 0.9482197212148807, "kl": 0.00267791748046875, "learning_rate": 1.3483974690700528e-07, "loss": 0.0566, "num_tokens": 11770296.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 329.125, "completions/mean_terminated_length": 329.125, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.8871938980329185, "frac_reward_zero_std": 1.0, "grad_norm": 0.019940270418221617, "kl": 0.00212860107421875, "learning_rate": 1.346777437890909e-07, "loss": 0.0001, "num_tokens": 11774161.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 321.0, "completions/max_terminated_length": 321.0, "completions/mean_length": 252.375, "completions/mean_terminated_length": 252.375, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.8874615281680718, "frac_reward_zero_std": 1.0, "grad_norm": 0.013521482656882211, "kl": 0.001270294189453125, "learning_rate": 1.3451610310705574e-07, "loss": 0.0001, "num_tokens": 11777340.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 427.125, "completions/mean_terminated_length": 427.125, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.887729158303225, "frac_reward_zero_std": 1.0, "grad_norm": 0.014644033220552174, "kl": 0.0009002685546875, "learning_rate": 1.343548250019573e-07, "loss": 0.0, "num_tokens": 11781933.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 251.125, "completions/mean_terminated_length": 251.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.8879967884383781, "frac_reward_zero_std": 1.0, "grad_norm": 0.014758163010907958, "kl": 0.0014190673828125, "learning_rate": 1.3419390961453673e-07, "loss": 0.0001, "num_tokens": 11784914.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 373.0, "completions/max_terminated_length": 373.0, "completions/mean_length": 311.5, "completions/mean_terminated_length": 311.5, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.8882644185735313, "frac_reward_zero_std": 1.0, "grad_norm": 0.012223930421073502, "kl": 0.0011272430419921875, "learning_rate": 1.34033357085219e-07, "loss": 0.0, "num_tokens": 11788298.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 370.0, "completions/mean_terminated_length": 370.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.8885320487086846, "frac_reward_zero_std": 0.5, "grad_norm": 0.6499400232675808, "kl": 0.001354217529296875, "learning_rate": 1.3387316755411188e-07, "loss": -0.0396, "num_tokens": 11792290.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.8887996788438378, "frac_reward_zero_std": 1.0, "grad_norm": 0.054012533364251804, "kl": 0.0026092529296875, "learning_rate": 1.3371334116100693e-07, "loss": 0.0001, "num_tokens": 11795139.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 494.0, "completions/max_terminated_length": 494.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.889067308978991, "frac_reward_zero_std": 1.0, "grad_norm": 0.01692516958621795, "kl": 0.001613616943359375, "learning_rate": 1.335538780453784e-07, "loss": 0.0001, "num_tokens": 11798707.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 340.25, "completions/mean_terminated_length": 340.25, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.8893349391141443, "frac_reward_zero_std": 1.0, "grad_norm": 0.014881022775111073, "kl": 0.001277923583984375, "learning_rate": 1.3339477834638364e-07, "loss": 0.0001, "num_tokens": 11802533.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 265.625, "completions/mean_terminated_length": 265.625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.8896025692492975, "frac_reward_zero_std": 1.0, "grad_norm": 0.013522364855609653, "kl": 0.00107574462890625, "learning_rate": 1.3323604220286292e-07, "loss": 0.0, "num_tokens": 11805706.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 426.75, "completions/mean_terminated_length": 426.75, "completions/min_length": 338.0, "completions/min_terminated_length": 338.0, "epoch": 0.8898701993844507, "frac_reward_zero_std": 1.0, "grad_norm": 0.10093347334199138, "kl": 0.004150390625, "learning_rate": 1.330776697533392e-07, "loss": 0.0002, "num_tokens": 11810420.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 347.0, "completions/max_terminated_length": 347.0, "completions/mean_length": 283.5, "completions/mean_terminated_length": 283.5, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.890137829519604, "frac_reward_zero_std": 1.0, "grad_norm": 0.025078473541529783, "kl": 0.0014190673828125, "learning_rate": 1.3291966113601815e-07, "loss": 0.0001, "num_tokens": 11813812.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 265.0, "completions/max_terminated_length": 265.0, "completions/mean_length": 213.5, "completions/mean_terminated_length": 213.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.8904054596547571, "frac_reward_zero_std": 1.0, "grad_norm": 0.014167460308796238, "kl": 0.001117706298828125, "learning_rate": 1.3276201648878779e-07, "loss": 0.0, "num_tokens": 11816484.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 323.0, "completions/max_terminated_length": 323.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.8906730897899103, "frac_reward_zero_std": 1.0, "grad_norm": 0.021413907334862353, "kl": 0.001659393310546875, "learning_rate": 1.3260473594921853e-07, "loss": 0.0001, "num_tokens": 11819361.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8909407199250635, "frac_reward_zero_std": 1.0, "grad_norm": 0.01630391844444648, "kl": 0.0010547637939453125, "learning_rate": 1.324478196545632e-07, "loss": 0.0, "num_tokens": 11822509.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 274.25, "completions/mean_terminated_length": 274.25, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.8912083500602168, "frac_reward_zero_std": 1.0, "grad_norm": 0.012691465612719433, "kl": 0.001064300537109375, "learning_rate": 1.3229126774175664e-07, "loss": 0.0, "num_tokens": 11825663.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 428.75, "completions/mean_terminated_length": 428.75, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.89147598019537, "frac_reward_zero_std": 0.5, "grad_norm": 0.7182040009800023, "kl": 0.0015411376953125, "learning_rate": 1.3213508034741566e-07, "loss": 0.0394, "num_tokens": 11830317.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 271.75, "completions/mean_terminated_length": 271.75, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.8917436103305232, "frac_reward_zero_std": 1.0, "grad_norm": 0.01437666970326668, "kl": 0.001079559326171875, "learning_rate": 1.3197925760783913e-07, "loss": 0.0, "num_tokens": 11833439.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 272.875, "completions/mean_terminated_length": 272.875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.8920112404656765, "frac_reward_zero_std": 1.0, "grad_norm": 0.013728777561401593, "kl": 0.0007686614990234375, "learning_rate": 1.3182379965900756e-07, "loss": 0.0, "num_tokens": 11836546.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 299.75, "completions/mean_terminated_length": 299.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.8922788706008297, "frac_reward_zero_std": 1.0, "grad_norm": 0.015279263193182017, "kl": 0.001171112060546875, "learning_rate": 1.3166870663658314e-07, "loss": 0.0, "num_tokens": 11840024.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 243.625, "completions/mean_terminated_length": 243.625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.8925465007359829, "frac_reward_zero_std": 1.0, "grad_norm": 0.03127388289184648, "kl": 0.002223968505859375, "learning_rate": 1.3151397867590963e-07, "loss": 0.0001, "num_tokens": 11843025.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 341.75, "completions/mean_terminated_length": 341.75, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.892814130871136, "frac_reward_zero_std": 0.5, "grad_norm": 1.3888278169816346, "kl": 0.0042572021484375, "learning_rate": 1.3135961591201232e-07, "loss": -0.0742, "num_tokens": 11846771.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 330.875, "completions/mean_terminated_length": 330.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.8930817610062893, "frac_reward_zero_std": 1.0, "grad_norm": 0.04380427218932546, "kl": 0.00222015380859375, "learning_rate": 1.3120561847959753e-07, "loss": 0.0001, "num_tokens": 11850814.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 241.375, "completions/mean_terminated_length": 241.375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.8933493911414425, "frac_reward_zero_std": 1.0, "grad_norm": 0.029713470419907802, "kl": 0.00148773193359375, "learning_rate": 1.3105198651305307e-07, "loss": 0.0001, "num_tokens": 11853729.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.8936170212765957, "frac_reward_zero_std": 0.5, "grad_norm": 0.9388586521055, "kl": 0.00194549560546875, "learning_rate": 1.308987201464477e-07, "loss": 0.0452, "num_tokens": 11857278.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 325.375, "completions/mean_terminated_length": 325.375, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.893884651411749, "frac_reward_zero_std": 1.0, "grad_norm": 0.021026610042891296, "kl": 0.001117706298828125, "learning_rate": 1.3074581951353107e-07, "loss": 0.0, "num_tokens": 11861077.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 299.0, "completions/mean_terminated_length": 299.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.8941522815469022, "frac_reward_zero_std": 1.0, "grad_norm": 0.08099314513832376, "kl": 0.00405120849609375, "learning_rate": 1.3059328474773377e-07, "loss": 0.0002, "num_tokens": 11864965.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 265.25, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.8944199116820554, "frac_reward_zero_std": 1.0, "grad_norm": 0.014333758165650388, "kl": 0.0009002685546875, "learning_rate": 1.3044111598216696e-07, "loss": 0.0, "num_tokens": 11868047.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 221.5, "completions/mean_terminated_length": 221.5, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.8946875418172087, "frac_reward_zero_std": 1.0, "grad_norm": 0.018157135660669035, "kl": 0.00131988525390625, "learning_rate": 1.3028931334962268e-07, "loss": 0.0001, "num_tokens": 11870731.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 471.5, "completions/mean_terminated_length": 471.5, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "epoch": 0.8949551719523618, "frac_reward_zero_std": 1.0, "grad_norm": 0.010553808316099927, "kl": 0.00121307373046875, "learning_rate": 1.3013787698257307e-07, "loss": 0.0, "num_tokens": 11875771.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 265.125, "completions/mean_terminated_length": 265.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.895222802087515, "frac_reward_zero_std": 1.0, "grad_norm": 0.016847198924054934, "kl": 0.0014801025390625, "learning_rate": 1.2998680701317112e-07, "loss": 0.0001, "num_tokens": 11878996.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.8954904322226682, "frac_reward_zero_std": 1.0, "grad_norm": 0.014818665083415701, "kl": 0.00136566162109375, "learning_rate": 1.298361035732497e-07, "loss": 0.0001, "num_tokens": 11881859.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.8957580623578215, "frac_reward_zero_std": 1.0, "grad_norm": 0.017865187546950677, "kl": 0.000804901123046875, "learning_rate": 1.2968576679432186e-07, "loss": 0.0, "num_tokens": 11884616.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 396.875, "completions/mean_terminated_length": 396.875, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.8960256924929747, "frac_reward_zero_std": 0.5, "grad_norm": 0.6288641140466638, "kl": 0.00194549560546875, "learning_rate": 1.2953579680758102e-07, "loss": -0.0775, "num_tokens": 11889279.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 367.5, "completions/mean_terminated_length": 367.5, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "epoch": 0.8962933226281279, "frac_reward_zero_std": 1.0, "grad_norm": 0.01504527709624823, "kl": 0.001216888427734375, "learning_rate": 1.2938619374389994e-07, "loss": 0.0, "num_tokens": 11893259.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 299.125, "completions/mean_terminated_length": 299.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.8965609527632812, "frac_reward_zero_std": 1.0, "grad_norm": 0.015971881391814664, "kl": 0.001262664794921875, "learning_rate": 1.2923695773383185e-07, "loss": 0.0001, "num_tokens": 11896824.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 328.625, "completions/mean_terminated_length": 328.625, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.8968285828984344, "frac_reward_zero_std": 1.0, "grad_norm": 0.015299551186649721, "kl": 0.001186370849609375, "learning_rate": 1.29088088907609e-07, "loss": 0.0, "num_tokens": 11900681.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 450.0, "completions/max_terminated_length": 450.0, "completions/mean_length": 354.0, "completions/mean_terminated_length": 354.0, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.8970962130335876, "frac_reward_zero_std": 1.0, "grad_norm": 0.014941368382104274, "kl": 0.001743316650390625, "learning_rate": 1.289395873951437e-07, "loss": 0.0001, "num_tokens": 11904685.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 307.375, "completions/mean_terminated_length": 307.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.8973638431687408, "frac_reward_zero_std": 1.0, "grad_norm": 0.02260998844090309, "kl": 0.0014209747314453125, "learning_rate": 1.2879145332602754e-07, "loss": 0.0001, "num_tokens": 11908288.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 217.5, "completions/mean_terminated_length": 217.5, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.897631473303894, "frac_reward_zero_std": 1.0, "grad_norm": 0.012367147950646907, "kl": 0.001064300537109375, "learning_rate": 1.2864368682953144e-07, "loss": 0.0, "num_tokens": 11910964.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 380.125, "completions/mean_terminated_length": 380.125, "completions/min_length": 267.0, "completions/min_terminated_length": 267.0, "epoch": 0.8978991034390472, "frac_reward_zero_std": 1.0, "grad_norm": 0.032846421698860206, "kl": 0.0014495849609375, "learning_rate": 1.284962880346056e-07, "loss": 0.0001, "num_tokens": 11915117.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.8981667335742004, "frac_reward_zero_std": 1.0, "grad_norm": 0.010508001340635514, "kl": 0.0005931854248046875, "learning_rate": 1.283492570698793e-07, "loss": 0.0, "num_tokens": 11918153.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 469.875, "completions/mean_terminated_length": 469.875, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "epoch": 0.8984343637093537, "frac_reward_zero_std": 0.5, "grad_norm": 0.5873290086365266, "kl": 0.0009613037109375, "learning_rate": 1.282025940636609e-07, "loss": 0.0076, "num_tokens": 11923148.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.8987019938445069, "frac_reward_zero_std": 1.0, "grad_norm": 0.011513473381008775, "kl": 0.00101470947265625, "learning_rate": 1.280562991439375e-07, "loss": 0.0, "num_tokens": 11926944.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 263.0, "completions/max_terminated_length": 263.0, "completions/mean_length": 218.25, "completions/mean_terminated_length": 218.25, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.8989696239796601, "frac_reward_zero_std": 1.0, "grad_norm": 0.04489696748325492, "kl": 0.00222015380859375, "learning_rate": 1.279103724383752e-07, "loss": 0.0001, "num_tokens": 11929798.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 394.375, "completions/mean_terminated_length": 394.375, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.8992372541148134, "frac_reward_zero_std": 1.0, "grad_norm": 0.010572458657489146, "kl": 0.00078582763671875, "learning_rate": 1.2776481407431858e-07, "loss": 0.0, "num_tokens": 11933985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 393.0, "completions/max_terminated_length": 393.0, "completions/mean_length": 298.75, "completions/mean_terminated_length": 298.75, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.8995048842499666, "frac_reward_zero_std": 1.0, "grad_norm": 0.023628550990079548, "kl": 0.00139617919921875, "learning_rate": 1.2761962417879099e-07, "loss": 0.0001, "num_tokens": 11937547.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 548.0, "completions/max_terminated_length": 548.0, "completions/mean_length": 358.5, "completions/mean_terminated_length": 358.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.8997725143851197, "frac_reward_zero_std": 0.5, "grad_norm": 0.8542947476690994, "kl": 0.00124359130859375, "learning_rate": 1.2747480287849397e-07, "loss": -0.0634, "num_tokens": 11941439.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 280.375, "completions/mean_terminated_length": 280.375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.900040144520273, "frac_reward_zero_std": 1.0, "grad_norm": 0.020277249403621772, "kl": 0.0013713836669921875, "learning_rate": 1.2733035029980764e-07, "loss": 0.0001, "num_tokens": 11944658.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 256.875, "completions/mean_terminated_length": 256.875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.9003077746554262, "frac_reward_zero_std": 1.0, "grad_norm": 0.018997014960177534, "kl": 0.001312255859375, "learning_rate": 1.2718626656879022e-07, "loss": 0.0001, "num_tokens": 11947761.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 280.0, "completions/max_terminated_length": 280.0, "completions/mean_length": 238.625, "completions/mean_terminated_length": 238.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.9005754047905794, "frac_reward_zero_std": 1.0, "grad_norm": 0.040095525058390515, "kl": 0.002010345458984375, "learning_rate": 1.2704255181117814e-07, "loss": 0.0001, "num_tokens": 11950658.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 292.875, "completions/mean_terminated_length": 292.875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.9008430349257326, "frac_reward_zero_std": 1.0, "grad_norm": 0.029382327202592567, "kl": 0.002117156982421875, "learning_rate": 1.268992061523856e-07, "loss": 0.0001, "num_tokens": 11953985.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 434.75, "completions/mean_terminated_length": 434.75, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.9011106650608859, "frac_reward_zero_std": 1.0, "grad_norm": 0.022145738204481055, "kl": 0.001293182373046875, "learning_rate": 1.267562297175051e-07, "loss": 0.0001, "num_tokens": 11958679.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 244.375, "completions/mean_terminated_length": 244.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9013782951960391, "frac_reward_zero_std": 1.0, "grad_norm": 0.0594058963964623, "kl": 0.0009002685546875, "learning_rate": 1.2661362263130655e-07, "loss": 0.0, "num_tokens": 11961678.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 367.0, "completions/max_terminated_length": 367.0, "completions/mean_length": 288.75, "completions/mean_terminated_length": 288.75, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9016459253311923, "frac_reward_zero_std": 1.0, "grad_norm": 0.022920695989908726, "kl": 0.002044677734375, "learning_rate": 1.2647138501823785e-07, "loss": 0.0001, "num_tokens": 11965136.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 238.375, "completions/mean_terminated_length": 238.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.9019135554663456, "frac_reward_zero_std": 1.0, "grad_norm": 0.02185887002866551, "kl": 0.002288818359375, "learning_rate": 1.263295170024243e-07, "loss": 0.0001, "num_tokens": 11968131.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 301.0, "completions/max_terminated_length": 301.0, "completions/mean_length": 266.625, "completions/mean_terminated_length": 266.625, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.9021811856014987, "frac_reward_zero_std": 1.0, "grad_norm": 0.02265382346789441, "kl": 0.001209259033203125, "learning_rate": 1.261880187076685e-07, "loss": 0.0, "num_tokens": 11971392.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 319.0, "completions/mean_terminated_length": 319.0, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.9024488157366519, "frac_reward_zero_std": 1.0, "grad_norm": 0.01231466624705516, "kl": 0.001129150390625, "learning_rate": 1.2604689025745097e-07, "loss": 0.0, "num_tokens": 11975052.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 273.875, "completions/mean_terminated_length": 273.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.9027164458718052, "frac_reward_zero_std": 1.0, "grad_norm": 0.022433559087263428, "kl": 0.00225067138671875, "learning_rate": 1.2590613177492875e-07, "loss": 0.0001, "num_tokens": 11978203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 457.0, "completions/max_terminated_length": 457.0, "completions/mean_length": 326.5, "completions/mean_terminated_length": 326.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.9029840760069584, "frac_reward_zero_std": 0.5, "grad_norm": 1.47770053584515, "kl": 0.002475738525390625, "learning_rate": 1.2576574338293678e-07, "loss": -0.0325, "num_tokens": 11982079.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 303.625, "completions/mean_terminated_length": 303.625, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9032517061421116, "frac_reward_zero_std": 0.5, "grad_norm": 0.9773060309534615, "kl": 0.001556396484375, "learning_rate": 1.2562572520398636e-07, "loss": 0.0268, "num_tokens": 11985464.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 449.0, "completions/max_terminated_length": 449.0, "completions/mean_length": 348.875, "completions/mean_terminated_length": 348.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9035193362772648, "frac_reward_zero_std": 0.5, "grad_norm": 1.0311573403350336, "kl": 0.001262664794921875, "learning_rate": 1.2548607736026604e-07, "loss": -0.0433, "num_tokens": 11989599.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 351.875, "completions/mean_terminated_length": 351.875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.9037869664124181, "frac_reward_zero_std": 1.0, "grad_norm": 0.024213359404565895, "kl": 0.00177764892578125, "learning_rate": 1.2534679997364132e-07, "loss": 0.0001, "num_tokens": 11993490.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 413.0, "completions/mean_terminated_length": 413.0, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.9040545965475713, "frac_reward_zero_std": 1.0, "grad_norm": 0.020329658068314067, "kl": 0.001190185546875, "learning_rate": 1.2520789316565405e-07, "loss": 0.0, "num_tokens": 11997970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 385.25, "completions/mean_terminated_length": 385.25, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.9043222266827244, "frac_reward_zero_std": 1.0, "grad_norm": 0.014696556816096353, "kl": 0.0009307861328125, "learning_rate": 1.250693570575231e-07, "loss": 0.0, "num_tokens": 12002216.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 323.75, "completions/mean_terminated_length": 323.75, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9045898568178777, "frac_reward_zero_std": 0.5, "grad_norm": 0.9651406649557871, "kl": 0.001323699951171875, "learning_rate": 1.2493119177014353e-07, "loss": 0.0311, "num_tokens": 12005830.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 246.375, "completions/mean_terminated_length": 246.375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9048574869530309, "frac_reward_zero_std": 1.0, "grad_norm": 0.010040280834857763, "kl": 0.0007495880126953125, "learning_rate": 1.247933974240869e-07, "loss": 0.0, "num_tokens": 12008781.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 451.625, "completions/mean_terminated_length": 451.625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.9051251170881841, "frac_reward_zero_std": 0.5, "grad_norm": 0.9120516343564652, "kl": 0.00145721435546875, "learning_rate": 1.2465597413960113e-07, "loss": -0.0276, "num_tokens": 12013798.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 373.0, "completions/mean_terminated_length": 373.0, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.9053927472233374, "frac_reward_zero_std": 1.0, "grad_norm": 0.010295360843305373, "kl": 0.00116729736328125, "learning_rate": 1.2451892203661026e-07, "loss": 0.0, "num_tokens": 12017870.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 230.625, "completions/mean_terminated_length": 230.625, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9056603773584906, "frac_reward_zero_std": 1.0, "grad_norm": 0.019234175823798463, "kl": 0.0015869140625, "learning_rate": 1.243822412347144e-07, "loss": 0.0001, "num_tokens": 12020671.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9059280074936438, "frac_reward_zero_std": 1.0, "grad_norm": 0.02193137535119513, "kl": 0.0016937255859375, "learning_rate": 1.242459318531897e-07, "loss": 0.0001, "num_tokens": 12023627.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 386.75, "completions/mean_terminated_length": 386.75, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.906195637628797, "frac_reward_zero_std": 1.0, "grad_norm": 0.007511786838176052, "kl": 0.0006256103515625, "learning_rate": 1.2410999401098818e-07, "loss": 0.0, "num_tokens": 12027881.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 445.625, "completions/mean_terminated_length": 445.625, "completions/min_length": 354.0, "completions/min_terminated_length": 354.0, "epoch": 0.9064632677639503, "frac_reward_zero_std": 1.0, "grad_norm": 0.007541757683486771, "kl": 0.0008392333984375, "learning_rate": 1.2397442782673752e-07, "loss": 0.0, "num_tokens": 12032550.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 447.0, "completions/max_terminated_length": 447.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9067308978991034, "frac_reward_zero_std": 1.0, "grad_norm": 0.012224739942266184, "kl": 0.001033782958984375, "learning_rate": 1.2383923341874125e-07, "loss": 0.0, "num_tokens": 12035735.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 575.125, "completions/mean_terminated_length": 575.125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.9069985280342566, "frac_reward_zero_std": 0.5, "grad_norm": 0.5702902907023796, "kl": 0.001537322998046875, "learning_rate": 1.237044109049783e-07, "loss": 0.0056, "num_tokens": 12041588.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 313.25, "completions/mean_terminated_length": 313.25, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9072661581694099, "frac_reward_zero_std": 1.0, "grad_norm": 0.013217246770204817, "kl": 0.001155853271484375, "learning_rate": 1.235699604031031e-07, "loss": 0.0, "num_tokens": 12045070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 313.0, "completions/max_terminated_length": 313.0, "completions/mean_length": 254.375, "completions/mean_terminated_length": 254.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9075337883045631, "frac_reward_zero_std": 1.0, "grad_norm": 0.014315002588792246, "kl": 0.0009975433349609375, "learning_rate": 1.2343588203044564e-07, "loss": 0.0, "num_tokens": 12048073.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 416.875, "completions/mean_terminated_length": 416.875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.9078014184397163, "frac_reward_zero_std": 1.0, "grad_norm": 0.016405334740362182, "kl": 0.00116729736328125, "learning_rate": 1.233021759040108e-07, "loss": 0.0, "num_tokens": 12052628.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 442.0, "completions/max_terminated_length": 442.0, "completions/mean_length": 327.875, "completions/mean_terminated_length": 327.875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.9080690485748695, "frac_reward_zero_std": 1.0, "grad_norm": 0.018972443107823436, "kl": 0.00176239013671875, "learning_rate": 1.231688421404789e-07, "loss": 0.0001, "num_tokens": 12056511.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 414.875, "completions/mean_terminated_length": 414.875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.9083366787100228, "frac_reward_zero_std": 1.0, "grad_norm": 0.015684181445653072, "kl": 0.001224517822265625, "learning_rate": 1.2303588085620531e-07, "loss": 0.0, "num_tokens": 12060974.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 334.75, "completions/mean_terminated_length": 334.75, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.908604308845176, "frac_reward_zero_std": 1.0, "grad_norm": 0.030468642818307887, "kl": 0.001781463623046875, "learning_rate": 1.2290329216722008e-07, "loss": 0.0001, "num_tokens": 12064748.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 350.125, "completions/mean_terminated_length": 350.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9088719389803291, "frac_reward_zero_std": 1.0, "grad_norm": 0.021912406399222224, "kl": 0.0020599365234375, "learning_rate": 1.2277107618922842e-07, "loss": 0.0001, "num_tokens": 12068833.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 318.0, "completions/mean_terminated_length": 318.0, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9091395691154824, "frac_reward_zero_std": 1.0, "grad_norm": 0.00963820642019651, "kl": 0.000743865966796875, "learning_rate": 1.2263923303761016e-07, "loss": 0.0, "num_tokens": 12072421.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 264.375, "completions/mean_terminated_length": 264.375, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.9094071992506356, "frac_reward_zero_std": 1.0, "grad_norm": 0.01721838693356275, "kl": 0.001399993896484375, "learning_rate": 1.2250776282741975e-07, "loss": 0.0001, "num_tokens": 12075476.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 400.375, "completions/mean_terminated_length": 400.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9096748293857888, "frac_reward_zero_std": 1.0, "grad_norm": 0.011828768232140652, "kl": 0.0009479522705078125, "learning_rate": 1.223766656733863e-07, "loss": 0.0, "num_tokens": 12079663.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 384.5, "completions/mean_terminated_length": 384.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9099424595209421, "frac_reward_zero_std": 1.0, "grad_norm": 0.016622693887619003, "kl": 0.001255035400390625, "learning_rate": 1.2224594168991315e-07, "loss": 0.0001, "num_tokens": 12084019.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 296.125, "completions/mean_terminated_length": 296.125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9102100896560953, "frac_reward_zero_std": 0.5, "grad_norm": 0.7718346976092185, "kl": 0.00262451171875, "learning_rate": 1.221155909910783e-07, "loss": 0.0142, "num_tokens": 12087644.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 488.0, "completions/max_terminated_length": 488.0, "completions/mean_length": 303.375, "completions/mean_terminated_length": 303.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9104777197912485, "frac_reward_zero_std": 1.0, "grad_norm": 0.01721277661208508, "kl": 0.00115966796875, "learning_rate": 1.2198561369063365e-07, "loss": 0.0, "num_tokens": 12091215.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 259.75, "completions/mean_terminated_length": 259.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9107453499264017, "frac_reward_zero_std": 1.0, "grad_norm": 0.021411165547946597, "kl": 0.0019683837890625, "learning_rate": 1.218560099020056e-07, "loss": 0.0001, "num_tokens": 12094365.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 274.5, "completions/mean_terminated_length": 274.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.911012980061555, "frac_reward_zero_std": 0.5, "grad_norm": 0.7033687491134148, "kl": 0.001941680908203125, "learning_rate": 1.2172677973829433e-07, "loss": 0.0094, "num_tokens": 12097513.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 404.625, "completions/mean_terminated_length": 404.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9112806101967081, "frac_reward_zero_std": 1.0, "grad_norm": 0.015469868366368243, "kl": 0.001178741455078125, "learning_rate": 1.2159792331227404e-07, "loss": 0.0, "num_tokens": 12101730.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 470.0, "completions/max_terminated_length": 470.0, "completions/mean_length": 313.125, "completions/mean_terminated_length": 313.125, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.9115482403318613, "frac_reward_zero_std": 1.0, "grad_norm": 0.025849377914843423, "kl": 0.0012664794921875, "learning_rate": 1.21469440736393e-07, "loss": 0.0001, "num_tokens": 12105219.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 283.0, "completions/max_terminated_length": 283.0, "completions/mean_length": 193.5, "completions/mean_terminated_length": 193.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9118158704670146, "frac_reward_zero_std": 1.0, "grad_norm": 0.04289952154078482, "kl": 0.002349853515625, "learning_rate": 1.2134133212277275e-07, "loss": 0.0001, "num_tokens": 12107623.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 338.0, "completions/mean_terminated_length": 338.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9120835006021678, "frac_reward_zero_std": 1.0, "grad_norm": 0.013985877672396911, "kl": 0.001361846923828125, "learning_rate": 1.212135975832091e-07, "loss": 0.0001, "num_tokens": 12111239.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 486.0, "completions/max_terminated_length": 486.0, "completions/mean_length": 305.75, "completions/mean_terminated_length": 305.75, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.912351130737321, "frac_reward_zero_std": 1.0, "grad_norm": 0.021606811266900512, "kl": 0.0014362335205078125, "learning_rate": 1.2108623722917092e-07, "loss": 0.0001, "num_tokens": 12114713.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 627.0, "completions/max_terminated_length": 627.0, "completions/mean_length": 381.125, "completions/mean_terminated_length": 381.125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.9126187608724743, "frac_reward_zero_std": 1.0, "grad_norm": 0.009234493009108873, "kl": 0.000759124755859375, "learning_rate": 1.2095925117180073e-07, "loss": 0.0, "num_tokens": 12118734.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 396.0, "completions/max_terminated_length": 396.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.9128863910076275, "frac_reward_zero_std": 1.0, "grad_norm": 0.011249884230486751, "kl": 0.00103759765625, "learning_rate": 1.2083263952191446e-07, "loss": 0.0, "num_tokens": 12122227.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 422.625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.9131540211427807, "frac_reward_zero_std": 0.5, "grad_norm": 0.6643507675619587, "kl": 0.0015716552734375, "learning_rate": 1.2070640239000128e-07, "loss": 0.0563, "num_tokens": 12126988.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9134216512779338, "frac_reward_zero_std": 1.0, "grad_norm": 0.029737726458130158, "kl": 0.0020599365234375, "learning_rate": 1.2058053988622348e-07, "loss": 0.0001, "num_tokens": 12130681.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 225.5, "completions/mean_terminated_length": 225.5, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9136892814130871, "frac_reward_zero_std": 1.0, "grad_norm": 0.011122880758706911, "kl": 0.000728607177734375, "learning_rate": 1.2045505212041643e-07, "loss": 0.0, "num_tokens": 12133469.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 456.0, "completions/max_terminated_length": 456.0, "completions/mean_length": 289.5, "completions/mean_terminated_length": 289.5, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9139569115482403, "frac_reward_zero_std": 1.0, "grad_norm": 0.017244525450052915, "kl": 0.001621246337890625, "learning_rate": 1.2032993920208865e-07, "loss": 0.0001, "num_tokens": 12136857.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 482.0, "completions/max_terminated_length": 482.0, "completions/mean_length": 382.625, "completions/mean_terminated_length": 382.625, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9142245416833935, "frac_reward_zero_std": 1.0, "grad_norm": 0.04748565547472278, "kl": 0.0021820068359375, "learning_rate": 1.2020520124042125e-07, "loss": 0.0001, "num_tokens": 12141010.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 452.0, "completions/max_terminated_length": 452.0, "completions/mean_length": 329.5, "completions/mean_terminated_length": 329.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9144921718185468, "frac_reward_zero_std": 1.0, "grad_norm": 0.021363493669898564, "kl": 0.001346588134765625, "learning_rate": 1.200808383442684e-07, "loss": 0.0001, "num_tokens": 12144786.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 346.0, "completions/max_terminated_length": 346.0, "completions/mean_length": 294.375, "completions/mean_terminated_length": 294.375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9147598019537, "frac_reward_zero_std": 1.0, "grad_norm": 0.008949313364030499, "kl": 0.000701904296875, "learning_rate": 1.199568506221568e-07, "loss": 0.0, "num_tokens": 12148165.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 375.0, "completions/max_terminated_length": 375.0, "completions/mean_length": 318.375, "completions/mean_terminated_length": 318.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9150274320888532, "frac_reward_zero_std": 1.0, "grad_norm": 0.03398487956169164, "kl": 0.001781463623046875, "learning_rate": 1.1983323818228578e-07, "loss": 0.0001, "num_tokens": 12151588.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 246.625, "completions/mean_terminated_length": 246.625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9152950622240065, "frac_reward_zero_std": 1.0, "grad_norm": 0.022652629586796445, "kl": 0.001514434814453125, "learning_rate": 1.1971000113252723e-07, "loss": 0.0001, "num_tokens": 12154573.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 404.0, "completions/max_terminated_length": 404.0, "completions/mean_length": 283.375, "completions/mean_terminated_length": 283.375, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9155626923591597, "frac_reward_zero_std": 1.0, "grad_norm": 0.010535128428813555, "kl": 0.0009002685546875, "learning_rate": 1.195871395804255e-07, "loss": 0.0, "num_tokens": 12157776.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.9158303224943128, "frac_reward_zero_std": 1.0, "grad_norm": 0.018121255154267317, "kl": 0.0008449554443359375, "learning_rate": 1.1946465363319697e-07, "loss": 0.0, "num_tokens": 12161762.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.916097952629466, "frac_reward_zero_std": 1.0, "grad_norm": 0.0415621968971025, "kl": 0.002227783203125, "learning_rate": 1.1934254339773072e-07, "loss": 0.0001, "num_tokens": 12164782.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 239.375, "completions/mean_terminated_length": 239.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9163655827646193, "frac_reward_zero_std": 1.0, "grad_norm": 0.014162138442550423, "kl": 0.0009975433349609375, "learning_rate": 1.1922080898058738e-07, "loss": 0.0, "num_tokens": 12167637.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 519.875, "completions/mean_terminated_length": 519.875, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.9166332128997725, "frac_reward_zero_std": 1.0, "grad_norm": 0.013801209200395109, "kl": 0.00171661376953125, "learning_rate": 1.1909945048800017e-07, "loss": 0.0001, "num_tokens": 12172976.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 358.75, "completions/mean_terminated_length": 358.75, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.9169008430349257, "frac_reward_zero_std": 1.0, "grad_norm": 0.010305505984750303, "kl": 0.0007572174072265625, "learning_rate": 1.1897846802587394e-07, "loss": 0.0, "num_tokens": 12177006.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 464.0, "completions/max_terminated_length": 464.0, "completions/mean_length": 266.5, "completions/mean_terminated_length": 266.5, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.917168473170079, "frac_reward_zero_std": 1.0, "grad_norm": 0.033369882827611726, "kl": 0.0014629364013671875, "learning_rate": 1.1885786169978546e-07, "loss": 0.0001, "num_tokens": 12180070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 297.625, "completions/mean_terminated_length": 297.625, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9174361033052322, "frac_reward_zero_std": 1.0, "grad_norm": 0.013090699387639395, "kl": 0.001056671142578125, "learning_rate": 1.1873763161498336e-07, "loss": 0.0, "num_tokens": 12183443.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9177037334403854, "frac_reward_zero_std": 1.0, "grad_norm": 0.012038627705949642, "kl": 0.00107574462890625, "learning_rate": 1.1861777787638762e-07, "loss": 0.0, "num_tokens": 12186581.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 256.5, "completions/mean_terminated_length": 256.5, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9179713635755387, "frac_reward_zero_std": 1.0, "grad_norm": 0.015050233298267202, "kl": 0.001438140869140625, "learning_rate": 1.1849830058859026e-07, "loss": 0.0001, "num_tokens": 12189489.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 280.75, "completions/mean_terminated_length": 280.75, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.9182389937106918, "frac_reward_zero_std": 1.0, "grad_norm": 0.029268725837711045, "kl": 0.00159454345703125, "learning_rate": 1.1837919985585432e-07, "loss": 0.0001, "num_tokens": 12192871.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.918506623845845, "frac_reward_zero_std": 1.0, "grad_norm": 0.011321364096758347, "kl": 0.000972747802734375, "learning_rate": 1.1826047578211473e-07, "loss": 0.0, "num_tokens": 12196652.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9187742539809982, "frac_reward_zero_std": 1.0, "grad_norm": 0.01676873638394615, "kl": 0.00106048583984375, "learning_rate": 1.1814212847097725e-07, "loss": 0.0, "num_tokens": 12200114.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 287.625, "completions/mean_terminated_length": 287.625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9190418841161515, "frac_reward_zero_std": 1.0, "grad_norm": 0.013919045291710187, "kl": 0.00140380859375, "learning_rate": 1.1802415802571914e-07, "loss": 0.0001, "num_tokens": 12203483.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 243.0, "completions/mean_terminated_length": 243.0, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9193095142513047, "frac_reward_zero_std": 0.5, "grad_norm": 0.8220392779998341, "kl": 0.00130462646484375, "learning_rate": 1.1790656454928867e-07, "loss": 0.0251, "num_tokens": 12206499.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 391.875, "completions/mean_terminated_length": 391.875, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "epoch": 0.9195771443864579, "frac_reward_zero_std": 1.0, "grad_norm": 0.02303724189159565, "kl": 0.0018463134765625, "learning_rate": 1.1778934814430517e-07, "loss": 0.0001, "num_tokens": 12210698.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 332.0, "completions/max_terminated_length": 332.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9198447745216112, "frac_reward_zero_std": 1.0, "grad_norm": 0.009118504088822688, "kl": 0.0005931854248046875, "learning_rate": 1.1767250891305906e-07, "loss": 0.0, "num_tokens": 12213729.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 337.875, "completions/mean_terminated_length": 337.875, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9201124046567644, "frac_reward_zero_std": 1.0, "grad_norm": 0.6856827107556583, "kl": 0.0087432861328125, "learning_rate": 1.1755604695751134e-07, "loss": 0.0004, "num_tokens": 12217604.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 339.0, "completions/max_terminated_length": 339.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9203800347919175, "frac_reward_zero_std": 1.0, "grad_norm": 0.1105013280478401, "kl": 0.002796173095703125, "learning_rate": 1.174399623792939e-07, "loss": 0.0001, "num_tokens": 12221102.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 253.75, "completions/mean_terminated_length": 253.75, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.9206476649270708, "frac_reward_zero_std": 1.0, "grad_norm": 0.07783444359022242, "kl": 0.00374603271484375, "learning_rate": 1.1732425527970935e-07, "loss": 0.0002, "num_tokens": 12224072.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 414.0, "completions/mean_terminated_length": 414.0, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.920915295062224, "frac_reward_zero_std": 0.5, "grad_norm": 0.616023961906555, "kl": 0.00188446044921875, "learning_rate": 1.1720892575973095e-07, "loss": 0.0696, "num_tokens": 12228500.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 344.5, "completions/mean_terminated_length": 344.5, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.9211829251973772, "frac_reward_zero_std": 0.0, "grad_norm": 1.41093223727066, "kl": 0.00140380859375, "learning_rate": 1.1709397392000221e-07, "loss": 0.0554, "num_tokens": 12232596.0, "reward": 0.5, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 337.625, "completions/mean_terminated_length": 337.625, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.9214505553325304, "frac_reward_zero_std": 1.0, "grad_norm": 0.02002576541463887, "kl": 0.0010242462158203125, "learning_rate": 1.169793998608373e-07, "loss": 0.0, "num_tokens": 12236405.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 340.375, "completions/mean_terminated_length": 340.375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.9217181854676837, "frac_reward_zero_std": 0.5, "grad_norm": 1.2790181226074675, "kl": 0.00115966796875, "learning_rate": 1.1686520368222067e-07, "loss": -0.0204, "num_tokens": 12240060.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 318.0, "completions/max_terminated_length": 318.0, "completions/mean_length": 214.5, "completions/mean_terminated_length": 214.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9219858156028369, "frac_reward_zero_std": 1.0, "grad_norm": 0.0179277310230129, "kl": 0.0013523101806640625, "learning_rate": 1.1675138548380691e-07, "loss": 0.0001, "num_tokens": 12242900.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 555.0, "completions/max_terminated_length": 555.0, "completions/mean_length": 406.375, "completions/mean_terminated_length": 406.375, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9222534457379901, "frac_reward_zero_std": 0.5, "grad_norm": 1.1418635697600261, "kl": 0.0011959075927734375, "learning_rate": 1.1663794536492083e-07, "loss": 0.0694, "num_tokens": 12247387.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 358.625, "completions/mean_terminated_length": 358.625, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.9225210758731434, "frac_reward_zero_std": 0.5, "grad_norm": 0.9919497717870494, "kl": 0.00225067138671875, "learning_rate": 1.1652488342455724e-07, "loss": -0.0745, "num_tokens": 12251516.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 322.25, "completions/mean_terminated_length": 322.25, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.9227887060082965, "frac_reward_zero_std": 1.0, "grad_norm": 0.012545378842148852, "kl": 0.001117706298828125, "learning_rate": 1.1641219976138109e-07, "loss": 0.0, "num_tokens": 12255330.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 278.375, "completions/mean_terminated_length": 278.375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9230563361434497, "frac_reward_zero_std": 1.0, "grad_norm": 0.020144382134732467, "kl": 0.00183868408203125, "learning_rate": 1.16299894473727e-07, "loss": 0.0001, "num_tokens": 12258661.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 525.0, "completions/max_terminated_length": 525.0, "completions/mean_length": 404.25, "completions/mean_terminated_length": 404.25, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.923323966278603, "frac_reward_zero_std": 1.0, "grad_norm": 0.014002951629543544, "kl": 0.001617431640625, "learning_rate": 1.1618796765959958e-07, "loss": 0.0001, "num_tokens": 12263095.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 481.0, "completions/max_terminated_length": 481.0, "completions/mean_length": 311.875, "completions/mean_terminated_length": 311.875, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9235915964137562, "frac_reward_zero_std": 1.0, "grad_norm": 0.013613035995927752, "kl": 0.0008907318115234375, "learning_rate": 1.1607641941667304e-07, "loss": 0.0, "num_tokens": 12266458.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 225.75, "completions/mean_terminated_length": 225.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9238592265489094, "frac_reward_zero_std": 1.0, "grad_norm": 0.017389993859362286, "kl": 0.001621246337890625, "learning_rate": 1.1596524984229137e-07, "loss": 0.0001, "num_tokens": 12269160.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9241268566840626, "frac_reward_zero_std": 0.5, "grad_norm": 0.9249239393296195, "kl": 0.00107574462890625, "learning_rate": 1.1585445903346783e-07, "loss": 0.0299, "num_tokens": 12272621.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 316.125, "completions/mean_terminated_length": 316.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9243944868192159, "frac_reward_zero_std": 1.0, "grad_norm": 0.011638278412916854, "kl": 0.00078582763671875, "learning_rate": 1.1574404708688547e-07, "loss": 0.0, "num_tokens": 12276134.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9246621169543691, "frac_reward_zero_std": 1.0, "grad_norm": 0.01602979425918778, "kl": 0.00151824951171875, "learning_rate": 1.156340140988966e-07, "loss": 0.0001, "num_tokens": 12279593.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 290.0, "completions/mean_terminated_length": 290.0, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9249297470895222, "frac_reward_zero_std": 1.0, "grad_norm": 0.01070538978213563, "kl": 0.000820159912109375, "learning_rate": 1.1552436016552274e-07, "loss": 0.0, "num_tokens": 12282873.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 311.125, "completions/mean_terminated_length": 311.125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.9251973772246755, "frac_reward_zero_std": 1.0, "grad_norm": 0.015857682864650047, "kl": 0.001071929931640625, "learning_rate": 1.1541508538245473e-07, "loss": 0.0, "num_tokens": 12286418.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 318.625, "completions/mean_terminated_length": 318.625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.9254650073598287, "frac_reward_zero_std": 0.5, "grad_norm": 0.6737619750613171, "kl": 0.000946044921875, "learning_rate": 1.1530618984505242e-07, "loss": -0.0492, "num_tokens": 12289911.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.9257326374949819, "frac_reward_zero_std": 1.0, "grad_norm": 0.045092498111402174, "kl": 0.001728057861328125, "learning_rate": 1.1519767364834495e-07, "loss": 0.0001, "num_tokens": 12293653.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 297.75, "completions/mean_terminated_length": 297.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.9260002676301351, "frac_reward_zero_std": 1.0, "grad_norm": 0.017243061194206242, "kl": 0.001373291015625, "learning_rate": 1.1508953688703006e-07, "loss": 0.0001, "num_tokens": 12297207.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 301.875, "completions/mean_terminated_length": 301.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9262678977652884, "frac_reward_zero_std": 1.0, "grad_norm": 0.03138759360231283, "kl": 0.00231170654296875, "learning_rate": 1.1498177965547476e-07, "loss": 0.0001, "num_tokens": 12300998.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 429.375, "completions/mean_terminated_length": 429.375, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.9265355279004416, "frac_reward_zero_std": 0.5, "grad_norm": 0.7731855479332529, "kl": 0.002231597900390625, "learning_rate": 1.1487440204771453e-07, "loss": 0.0001, "num_tokens": 12305669.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 347.125, "completions/mean_terminated_length": 347.125, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.9268031580355948, "frac_reward_zero_std": 1.0, "grad_norm": 0.02971041425129664, "kl": 0.00164031982421875, "learning_rate": 1.1476740415745382e-07, "loss": 0.0001, "num_tokens": 12309398.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 467.0, "completions/max_terminated_length": 467.0, "completions/mean_length": 368.0, "completions/mean_terminated_length": 368.0, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.9270707881707481, "frac_reward_zero_std": 1.0, "grad_norm": 0.01167517019424822, "kl": 0.0006389617919921875, "learning_rate": 1.1466078607806551e-07, "loss": 0.0, "num_tokens": 12313442.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 317.0, "completions/max_terminated_length": 317.0, "completions/mean_length": 261.5, "completions/mean_terminated_length": 261.5, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.9273384183059012, "frac_reward_zero_std": 1.0, "grad_norm": 0.013617945018568835, "kl": 0.001399993896484375, "learning_rate": 1.1455454790259117e-07, "loss": 0.0001, "num_tokens": 12316638.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 177.25, "completions/mean_terminated_length": 177.25, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.9276060484410544, "frac_reward_zero_std": 1.0, "grad_norm": 0.023138992586376993, "kl": 0.00146484375, "learning_rate": 1.144486897237409e-07, "loss": 0.0001, "num_tokens": 12319056.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 270.875, "completions/mean_terminated_length": 270.875, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.9278736785762077, "frac_reward_zero_std": 1.0, "grad_norm": 0.01693028038398165, "kl": 0.0016326904296875, "learning_rate": 1.1434321163389285e-07, "loss": 0.0001, "num_tokens": 12322359.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9281413087113609, "frac_reward_zero_std": 1.0, "grad_norm": 0.014229549702441925, "kl": 0.001312255859375, "learning_rate": 1.14238113725094e-07, "loss": 0.0001, "num_tokens": 12326090.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 337.125, "completions/mean_terminated_length": 337.125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9284089388465141, "frac_reward_zero_std": 1.0, "grad_norm": 0.010767325188454963, "kl": 0.00090789794921875, "learning_rate": 1.1413339608905906e-07, "loss": 0.0, "num_tokens": 12330055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 440.0, "completions/max_terminated_length": 440.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9286765689816673, "frac_reward_zero_std": 0.5, "grad_norm": 0.6672307983740743, "kl": 0.00113677978515625, "learning_rate": 1.1402905881717126e-07, "loss": -0.0064, "num_tokens": 12333971.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 340.5, "completions/mean_terminated_length": 340.5, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.9289441991168206, "frac_reward_zero_std": 1.0, "grad_norm": 0.014360215849341313, "kl": 0.00168609619140625, "learning_rate": 1.1392510200048167e-07, "loss": 0.0001, "num_tokens": 12337743.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 274.125, "completions/mean_terminated_length": 274.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9292118292519738, "frac_reward_zero_std": 1.0, "grad_norm": 0.01357715081686496, "kl": 0.0010738372802734375, "learning_rate": 1.1382152572970948e-07, "loss": 0.0, "num_tokens": 12340852.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 356.125, "completions/mean_terminated_length": 356.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.929479459387127, "frac_reward_zero_std": 0.5, "grad_norm": 0.6184031832265503, "kl": 0.00122833251953125, "learning_rate": 1.1371833009524173e-07, "loss": -0.0258, "num_tokens": 12344745.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.9297470895222802, "frac_reward_zero_std": 1.0, "grad_norm": 0.01985846698226978, "kl": 0.001399993896484375, "learning_rate": 1.1361551518713331e-07, "loss": 0.0001, "num_tokens": 12347639.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 304.375, "completions/mean_terminated_length": 304.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9300147196574334, "frac_reward_zero_std": 1.0, "grad_norm": 0.007733541995419877, "kl": 0.00046062469482421875, "learning_rate": 1.1351308109510688e-07, "loss": 0.0, "num_tokens": 12351038.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9302823497925866, "frac_reward_zero_std": 1.0, "grad_norm": 0.013116730093929796, "kl": 0.00135040283203125, "learning_rate": 1.134110279085527e-07, "loss": 0.0001, "num_tokens": 12354360.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 373.375, "completions/mean_terminated_length": 373.375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.9305499799277399, "frac_reward_zero_std": 0.5, "grad_norm": 0.9175994976886448, "kl": 0.001659393310546875, "learning_rate": 1.1330935571652878e-07, "loss": -0.0084, "num_tokens": 12358515.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.9308176100628931, "frac_reward_zero_std": 1.0, "grad_norm": 0.020203386868739463, "kl": 0.001201629638671875, "learning_rate": 1.1320806460776054e-07, "loss": 0.0, "num_tokens": 12361667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.9310852401980463, "frac_reward_zero_std": 1.0, "grad_norm": 0.020840203586345477, "kl": 0.00164794921875, "learning_rate": 1.1310715467064084e-07, "loss": 0.0001, "num_tokens": 12365707.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.9313528703331995, "frac_reward_zero_std": 1.0, "grad_norm": 0.02342461726923771, "kl": 0.002361297607421875, "learning_rate": 1.1300662599322992e-07, "loss": 0.0001, "num_tokens": 12368747.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 304.125, "completions/mean_terminated_length": 304.125, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.9316205004683528, "frac_reward_zero_std": 1.0, "grad_norm": 0.013044345179383866, "kl": 0.00128173828125, "learning_rate": 1.1290647866325535e-07, "loss": 0.0001, "num_tokens": 12372424.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.931888130603506, "frac_reward_zero_std": 1.0, "grad_norm": 0.013038647012658992, "kl": 0.0009250640869140625, "learning_rate": 1.1280671276811186e-07, "loss": 0.0, "num_tokens": 12375700.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9321557607386591, "frac_reward_zero_std": 1.0, "grad_norm": 0.014749432793046469, "kl": 0.0013580322265625, "learning_rate": 1.1270732839486137e-07, "loss": 0.0001, "num_tokens": 12379406.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 453.0, "completions/max_terminated_length": 453.0, "completions/mean_length": 258.5, "completions/mean_terminated_length": 258.5, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9324233908738124, "frac_reward_zero_std": 1.0, "grad_norm": 0.01980614818650539, "kl": 0.0014495849609375, "learning_rate": 1.1260832563023275e-07, "loss": 0.0001, "num_tokens": 12382402.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9326910210089656, "frac_reward_zero_std": 1.0, "grad_norm": 0.006724947372117632, "kl": 0.0004730224609375, "learning_rate": 1.1250970456062196e-07, "loss": 0.0, "num_tokens": 12385505.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.9329586511441188, "frac_reward_zero_std": 1.0, "grad_norm": 0.030310756344861793, "kl": 0.00240325927734375, "learning_rate": 1.1241146527209192e-07, "loss": 0.0001, "num_tokens": 12388226.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 382.0, "completions/max_terminated_length": 382.0, "completions/mean_length": 301.0, "completions/mean_terminated_length": 301.0, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.9332262812792721, "frac_reward_zero_std": 0.5, "grad_norm": 1.0540866187904896, "kl": 0.001834869384765625, "learning_rate": 1.1231360785037232e-07, "loss": 0.0078, "num_tokens": 12391722.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 491.0, "completions/max_terminated_length": 491.0, "completions/mean_length": 387.75, "completions/mean_terminated_length": 387.75, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.9334939114144253, "frac_reward_zero_std": 0.5, "grad_norm": 0.6346990108756493, "kl": 0.0010776519775390625, "learning_rate": 1.1221613238085952e-07, "loss": -0.0426, "num_tokens": 12395924.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 575.0, "completions/max_terminated_length": 575.0, "completions/mean_length": 340.25, "completions/mean_terminated_length": 340.25, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.9337615415495785, "frac_reward_zero_std": 1.0, "grad_norm": 0.01255651155249612, "kl": 0.0010242462158203125, "learning_rate": 1.1211903894861654e-07, "loss": 0.0, "num_tokens": 12399666.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 328.0, "completions/max_terminated_length": 328.0, "completions/mean_length": 273.125, "completions/mean_terminated_length": 273.125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.9340291716847317, "frac_reward_zero_std": 1.0, "grad_norm": 0.029347844978218848, "kl": 0.0018157958984375, "learning_rate": 1.1202232763837336e-07, "loss": 0.0001, "num_tokens": 12402919.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 238.0, "completions/mean_terminated_length": 238.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.934296801819885, "frac_reward_zero_std": 1.0, "grad_norm": 0.02777332922317929, "kl": 0.0017852783203125, "learning_rate": 1.11925998534526e-07, "loss": 0.0001, "num_tokens": 12405787.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 359.0, "completions/max_terminated_length": 359.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 275.5, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9345644319550381, "frac_reward_zero_std": 1.0, "grad_norm": 0.017537391664068775, "kl": 0.0010089874267578125, "learning_rate": 1.1183005172113744e-07, "loss": 0.0, "num_tokens": 12408995.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 306.0, "completions/max_terminated_length": 306.0, "completions/mean_length": 250.625, "completions/mean_terminated_length": 250.625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.9348320620901913, "frac_reward_zero_std": 1.0, "grad_norm": 0.010805451998257538, "kl": 0.001003265380859375, "learning_rate": 1.1173448728193659e-07, "loss": 0.0, "num_tokens": 12412056.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 324.5, "completions/mean_terminated_length": 324.5, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.9350996922253446, "frac_reward_zero_std": 1.0, "grad_norm": 0.013883080685420693, "kl": 0.001220703125, "learning_rate": 1.1163930530031894e-07, "loss": 0.0, "num_tokens": 12415660.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 522.0, "completions/max_terminated_length": 522.0, "completions/mean_length": 410.75, "completions/mean_terminated_length": 410.75, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.9353673223604978, "frac_reward_zero_std": 1.0, "grad_norm": 0.0121619768163617, "kl": 0.0011444091796875, "learning_rate": 1.1154450585934624e-07, "loss": 0.0, "num_tokens": 12419974.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 432.375, "completions/mean_terminated_length": 432.375, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.935634952495651, "frac_reward_zero_std": 1.0, "grad_norm": 0.009572043481667842, "kl": 0.0008182525634765625, "learning_rate": 1.114500890417463e-07, "loss": 0.0, "num_tokens": 12424557.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 279.375, "completions/mean_terminated_length": 279.375, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9359025826308043, "frac_reward_zero_std": 1.0, "grad_norm": 0.01516017618211087, "kl": 0.001506805419921875, "learning_rate": 1.1135605492991313e-07, "loss": 0.0001, "num_tokens": 12427792.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 511.0, "completions/max_terminated_length": 511.0, "completions/mean_length": 342.75, "completions/mean_terminated_length": 342.75, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.9361702127659575, "frac_reward_zero_std": 1.0, "grad_norm": 0.01830429100877258, "kl": 0.00128173828125, "learning_rate": 1.1126240360590658e-07, "loss": 0.0001, "num_tokens": 12431542.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 307.0, "completions/mean_terminated_length": 307.0, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9364378429011107, "frac_reward_zero_std": 1.0, "grad_norm": 0.019980616259090225, "kl": 0.001384735107421875, "learning_rate": 1.1116913515145271e-07, "loss": 0.0001, "num_tokens": 12434958.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 546.0, "completions/max_terminated_length": 546.0, "completions/mean_length": 347.5, "completions/mean_terminated_length": 347.5, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.9367054730362638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0203798256200669, "kl": 0.001468658447265625, "learning_rate": 1.1107624964794324e-07, "loss": 0.0001, "num_tokens": 12438974.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 357.125, "completions/mean_terminated_length": 357.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9369731031714171, "frac_reward_zero_std": 1.0, "grad_norm": 0.013785888048457635, "kl": 0.001491546630859375, "learning_rate": 1.1098374717643588e-07, "loss": 0.0001, "num_tokens": 12442843.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 279.0, "completions/max_terminated_length": 279.0, "completions/mean_length": 226.375, "completions/mean_terminated_length": 226.375, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9372407333065703, "frac_reward_zero_std": 1.0, "grad_norm": 0.020367211462281626, "kl": 0.00179290771484375, "learning_rate": 1.1089162781765397e-07, "loss": 0.0001, "num_tokens": 12445746.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 324.0, "completions/max_terminated_length": 324.0, "completions/mean_length": 283.0, "completions/mean_terminated_length": 283.0, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.9375083634417235, "frac_reward_zero_std": 1.0, "grad_norm": 0.014997819537607236, "kl": 0.00104522705078125, "learning_rate": 1.1079989165198659e-07, "loss": 0.0, "num_tokens": 12448970.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 276.0, "completions/max_terminated_length": 276.0, "completions/mean_length": 231.125, "completions/mean_terminated_length": 231.125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9377759935768768, "frac_reward_zero_std": 1.0, "grad_norm": 0.027029734235231678, "kl": 0.0017452239990234375, "learning_rate": 1.1070853875948838e-07, "loss": 0.0001, "num_tokens": 12451735.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 278.125, "completions/mean_terminated_length": 278.125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.93804362371203, "frac_reward_zero_std": 1.0, "grad_norm": 0.015342798489814657, "kl": 0.0010967254638671875, "learning_rate": 1.106175692198795e-07, "loss": 0.0, "num_tokens": 12454968.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 286.375, "completions/mean_terminated_length": 286.375, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9383112538471832, "frac_reward_zero_std": 0.5, "grad_norm": 0.9787171800734283, "kl": 0.0012664794921875, "learning_rate": 1.1052698311254558e-07, "loss": -0.0403, "num_tokens": 12458295.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 278.0, "completions/max_terminated_length": 278.0, "completions/mean_length": 255.0, "completions/mean_terminated_length": 255.0, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.9385788839823365, "frac_reward_zero_std": 1.0, "grad_norm": 0.01417226146701318, "kl": 0.001224517822265625, "learning_rate": 1.1043678051653768e-07, "loss": 0.0, "num_tokens": 12461351.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 266.875, "completions/mean_terminated_length": 266.875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9388465141174896, "frac_reward_zero_std": 1.0, "grad_norm": 0.01747598408046655, "kl": 0.0014190673828125, "learning_rate": 1.103469615105722e-07, "loss": 0.0001, "num_tokens": 12464450.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 291.0, "completions/max_terminated_length": 291.0, "completions/mean_length": 247.375, "completions/mean_terminated_length": 247.375, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.9391141442526428, "frac_reward_zero_std": 1.0, "grad_norm": 0.014194920437561192, "kl": 0.0012969970703125, "learning_rate": 1.102575261730307e-07, "loss": 0.0001, "num_tokens": 12467377.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 271.625, "completions/mean_terminated_length": 271.625, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.939381774387796, "frac_reward_zero_std": 1.0, "grad_norm": 0.010837559326633205, "kl": 0.0010128021240234375, "learning_rate": 1.1016847458195999e-07, "loss": 0.0, "num_tokens": 12470586.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 287.375, "completions/mean_terminated_length": 287.375, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.9396494045229493, "frac_reward_zero_std": 1.0, "grad_norm": 0.02898925758809539, "kl": 0.002178192138671875, "learning_rate": 1.1007980681507204e-07, "loss": 0.0001, "num_tokens": 12474121.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 684.0, "completions/max_terminated_length": 684.0, "completions/mean_length": 516.5, "completions/mean_terminated_length": 516.5, "completions/min_length": 351.0, "completions/min_terminated_length": 351.0, "epoch": 0.9399170346581025, "frac_reward_zero_std": 0.5, "grad_norm": 0.5861671984539168, "kl": 0.001251220703125, "learning_rate": 1.0999152294974387e-07, "loss": 0.0001, "num_tokens": 12479389.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 359.375, "completions/mean_terminated_length": 359.375, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9401846647932557, "frac_reward_zero_std": 1.0, "grad_norm": 0.02002443752579469, "kl": 0.0013275146484375, "learning_rate": 1.0990362306301726e-07, "loss": 0.0001, "num_tokens": 12483312.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 475.0, "completions/max_terminated_length": 475.0, "completions/mean_length": 351.375, "completions/mean_terminated_length": 351.375, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.940452294928409, "frac_reward_zero_std": 1.0, "grad_norm": 0.016597557034900144, "kl": 0.0010986328125, "learning_rate": 1.098161072315992e-07, "loss": 0.0, "num_tokens": 12487247.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 258.0, "completions/mean_terminated_length": 258.0, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.9407199250635622, "frac_reward_zero_std": 1.0, "grad_norm": 0.013629325585829431, "kl": 0.000957489013671875, "learning_rate": 1.0972897553186142e-07, "loss": 0.0, "num_tokens": 12490311.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 349.875, "completions/mean_terminated_length": 349.875, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.9409875551987154, "frac_reward_zero_std": 1.0, "grad_norm": 0.025039258656098004, "kl": 0.00164794921875, "learning_rate": 1.0964222803984048e-07, "loss": 0.0001, "num_tokens": 12494382.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.9412551853338685, "frac_reward_zero_std": 1.0, "grad_norm": 0.02544284790148124, "kl": 0.00194549560546875, "learning_rate": 1.0955586483123752e-07, "loss": 0.0001, "num_tokens": 12497587.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 400.125, "completions/mean_terminated_length": 400.125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.9415228154690218, "frac_reward_zero_std": 1.0, "grad_norm": 0.013404739777241726, "kl": 0.001689910888671875, "learning_rate": 1.0946988598141836e-07, "loss": 0.0001, "num_tokens": 12501928.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 346.25, "completions/mean_terminated_length": 346.25, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.941790445604175, "frac_reward_zero_std": 0.5, "grad_norm": 0.820164275185158, "kl": 0.0020599365234375, "learning_rate": 1.0938429156541364e-07, "loss": 0.0607, "num_tokens": 12505946.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 629.0, "completions/max_terminated_length": 629.0, "completions/mean_length": 330.125, "completions/mean_terminated_length": 330.125, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.9420580757393282, "frac_reward_zero_std": 1.0, "grad_norm": 0.032479166820444445, "kl": 0.00142669677734375, "learning_rate": 1.0929908165791815e-07, "loss": 0.0001, "num_tokens": 12509767.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 234.625, "completions/mean_terminated_length": 234.625, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.9423257058744815, "frac_reward_zero_std": 1.0, "grad_norm": 0.018292072105201947, "kl": 0.00121307373046875, "learning_rate": 1.0921425633329145e-07, "loss": 0.0, "num_tokens": 12512648.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 570.0, "completions/max_terminated_length": 570.0, "completions/mean_length": 395.125, "completions/mean_terminated_length": 395.125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9425933360096347, "frac_reward_zero_std": 1.0, "grad_norm": 0.015012198657853765, "kl": 0.00138092041015625, "learning_rate": 1.0912981566555735e-07, "loss": 0.0001, "num_tokens": 12516901.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 344.25, "completions/mean_terminated_length": 344.25, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9428609661447879, "frac_reward_zero_std": 1.0, "grad_norm": 0.018536418259587704, "kl": 0.001514434814453125, "learning_rate": 1.090457597284039e-07, "loss": 0.0001, "num_tokens": 12520787.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 352.375, "completions/mean_terminated_length": 352.375, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9431285962799412, "frac_reward_zero_std": 1.0, "grad_norm": 0.016570354455770996, "kl": 0.00148773193359375, "learning_rate": 1.0896208859518364e-07, "loss": 0.0001, "num_tokens": 12524642.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 406.875, "completions/mean_terminated_length": 406.875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.9433962264150944, "frac_reward_zero_std": 0.5, "grad_norm": 0.6605340044297661, "kl": 0.001068115234375, "learning_rate": 1.0887880233891306e-07, "loss": 0.0017, "num_tokens": 12529233.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 515.0, "completions/max_terminated_length": 515.0, "completions/mean_length": 387.125, "completions/mean_terminated_length": 387.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9436638565502475, "frac_reward_zero_std": 1.0, "grad_norm": 0.02762576003056751, "kl": 0.001384735107421875, "learning_rate": 1.0879590103227302e-07, "loss": 0.0001, "num_tokens": 12533378.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 331.0, "completions/max_terminated_length": 331.0, "completions/mean_length": 216.375, "completions/mean_terminated_length": 216.375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.9439314866854007, "frac_reward_zero_std": 1.0, "grad_norm": 0.016775156024908942, "kl": 0.001537322998046875, "learning_rate": 1.0871338474760824e-07, "loss": 0.0001, "num_tokens": 12536029.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 245.5, "completions/mean_terminated_length": 245.5, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.944199116820554, "frac_reward_zero_std": 1.0, "grad_norm": 0.010878049943721671, "kl": 0.000843048095703125, "learning_rate": 1.0863125355692749e-07, "loss": 0.0, "num_tokens": 12538937.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 400.0, "completions/max_terminated_length": 400.0, "completions/mean_length": 297.5, "completions/mean_terminated_length": 297.5, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9444667469557072, "frac_reward_zero_std": 1.0, "grad_norm": 0.018098152221135415, "kl": 0.001312255859375, "learning_rate": 1.0854950753190366e-07, "loss": 0.0001, "num_tokens": 12542245.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 411.5, "completions/mean_terminated_length": 411.5, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.9447343770908604, "frac_reward_zero_std": 1.0, "grad_norm": 0.01360874926330972, "kl": 0.001140594482421875, "learning_rate": 1.0846814674387322e-07, "loss": 0.0, "num_tokens": 12546773.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 307.875, "completions/mean_terminated_length": 307.875, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9450020072260137, "frac_reward_zero_std": 0.5, "grad_norm": 1.1421721908992508, "kl": 0.004924774169921875, "learning_rate": 1.0838717126383676e-07, "loss": -0.0024, "num_tokens": 12550384.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9452696373611669, "frac_reward_zero_std": 1.0, "grad_norm": 0.03045778276343992, "kl": 0.00246429443359375, "learning_rate": 1.0830658116245848e-07, "loss": 0.0001, "num_tokens": 12553796.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 360.375, "completions/mean_terminated_length": 360.375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.94553726749632, "frac_reward_zero_std": 1.0, "grad_norm": 0.0128501607096263, "kl": 0.00128173828125, "learning_rate": 1.0822637651006616e-07, "loss": 0.0001, "num_tokens": 12557811.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 255.0, "completions/max_terminated_length": 255.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.9458048976314734, "frac_reward_zero_std": 1.0, "grad_norm": 0.020135105423480553, "kl": 0.00180816650390625, "learning_rate": 1.0814655737665148e-07, "loss": 0.0001, "num_tokens": 12560667.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 294.875, "completions/mean_terminated_length": 294.875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9460725277666265, "frac_reward_zero_std": 1.0, "grad_norm": 0.021960535951144174, "kl": 0.00197601318359375, "learning_rate": 1.0806712383186945e-07, "loss": 0.0001, "num_tokens": 12564162.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9463401579017797, "frac_reward_zero_std": 0.0, "grad_norm": 1.1056433201647573, "kl": 0.00241851806640625, "learning_rate": 1.0798807594503878e-07, "loss": -0.099, "num_tokens": 12568108.0, "reward": 0.75, "reward_std": 0.5, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 519.0, "completions/max_terminated_length": 519.0, "completions/mean_length": 358.875, "completions/mean_terminated_length": 358.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.9466077880369329, "frac_reward_zero_std": 1.0, "grad_norm": 0.01621139899187077, "kl": 0.001293182373046875, "learning_rate": 1.0790941378514151e-07, "loss": 0.0001, "num_tokens": 12572443.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 442.875, "completions/mean_terminated_length": 442.875, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "epoch": 0.9468754181720862, "frac_reward_zero_std": 0.5, "grad_norm": 0.9335788136543683, "kl": 0.0016021728515625, "learning_rate": 1.0783113742082306e-07, "loss": 0.0001, "num_tokens": 12577198.0, "reward": 0.25, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 799.0, "completions/max_terminated_length": 799.0, "completions/mean_length": 439.625, "completions/mean_terminated_length": 439.625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.9471430483072394, "frac_reward_zero_std": 0.5, "grad_norm": 0.5778689184247937, "kl": 0.0008182525634765625, "learning_rate": 1.077532469203923e-07, "loss": -0.0347, "num_tokens": 12581823.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 875.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 458.75, "completions/mean_terminated_length": 458.75, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.9474106784423926, "frac_reward_zero_std": 1.0, "grad_norm": 0.013684803462260132, "kl": 0.00164031982421875, "learning_rate": 1.0767574235182124e-07, "loss": 0.0001, "num_tokens": 12586537.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 425.25, "completions/mean_terminated_length": 425.25, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.9476783085775459, "frac_reward_zero_std": 1.0, "grad_norm": 0.011193441909445297, "kl": 0.0010223388671875, "learning_rate": 1.0759862378274523e-07, "loss": 0.0, "num_tokens": 12591147.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 434.625, "completions/mean_terminated_length": 434.625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.947945938712699, "frac_reward_zero_std": 1.0, "grad_norm": 0.020054288783217612, "kl": 0.001720428466796875, "learning_rate": 1.0752189128046255e-07, "loss": 0.0001, "num_tokens": 12595840.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 326.375, "completions/mean_terminated_length": 326.375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.9482135688478522, "frac_reward_zero_std": 1.0, "grad_norm": 0.017652595683373167, "kl": 0.001361846923828125, "learning_rate": 1.0744554491193483e-07, "loss": 0.0001, "num_tokens": 12599739.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 263.375, "completions/mean_terminated_length": 263.375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9484811989830055, "frac_reward_zero_std": 1.0, "grad_norm": 0.012315449226744823, "kl": 0.001068115234375, "learning_rate": 1.0736958474378666e-07, "loss": 0.0, "num_tokens": 12602962.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9487488291181587, "frac_reward_zero_std": 1.0, "grad_norm": 0.009720436551282099, "kl": 0.00072479248046875, "learning_rate": 1.0729401084230552e-07, "loss": 0.0, "num_tokens": 12605971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 230.5, "completions/mean_terminated_length": 230.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9490164592533119, "frac_reward_zero_std": 1.0, "grad_norm": 0.026833905694381956, "kl": 0.00229644775390625, "learning_rate": 1.0721882327344199e-07, "loss": 0.0001, "num_tokens": 12609063.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 339.125, "completions/mean_terminated_length": 339.125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9492840893884651, "frac_reward_zero_std": 1.0, "grad_norm": 0.011530354756187978, "kl": 0.000751495361328125, "learning_rate": 1.0714402210280918e-07, "loss": 0.0, "num_tokens": 12612860.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 343.0, "completions/max_terminated_length": 343.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.9495517195236184, "frac_reward_zero_std": 1.0, "grad_norm": 0.01795936294155302, "kl": 0.001018524169921875, "learning_rate": 1.0706960739568339e-07, "loss": 0.0, "num_tokens": 12615681.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 345.75, "completions/mean_terminated_length": 345.75, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9498193496587716, "frac_reward_zero_std": 1.0, "grad_norm": 0.019904513211681327, "kl": 0.0009593963623046875, "learning_rate": 1.0699557921700338e-07, "loss": 0.0, "num_tokens": 12619727.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 349.0, "completions/max_terminated_length": 349.0, "completions/mean_length": 245.25, "completions/mean_terminated_length": 245.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9500869797939248, "frac_reward_zero_std": 1.0, "grad_norm": 0.025759883732964332, "kl": 0.002437591552734375, "learning_rate": 1.0692193763137082e-07, "loss": 0.0001, "num_tokens": 12622665.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 366.25, "completions/mean_terminated_length": 366.25, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.950354609929078, "frac_reward_zero_std": 1.0, "grad_norm": 0.006804369930216316, "kl": 0.000583648681640625, "learning_rate": 1.0684868270304983e-07, "loss": 0.0, "num_tokens": 12626671.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 561.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 406.5, "completions/mean_terminated_length": 406.5, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9506222400642312, "frac_reward_zero_std": 1.0, "grad_norm": 0.021003506813797475, "kl": 0.001163482666015625, "learning_rate": 1.0677581449596724e-07, "loss": 0.0, "num_tokens": 12631203.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 400.75, "completions/mean_terminated_length": 400.75, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.9508898701993844, "frac_reward_zero_std": 1.0, "grad_norm": 0.023792198875557582, "kl": 0.0019073486328125, "learning_rate": 1.0670333307371236e-07, "loss": 0.0001, "num_tokens": 12635805.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 293.0, "completions/mean_terminated_length": 293.0, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9511575003345377, "frac_reward_zero_std": 0.5, "grad_norm": 1.0210926061859733, "kl": 0.001255035400390625, "learning_rate": 1.0663123849953685e-07, "loss": -0.0633, "num_tokens": 12639417.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 487.0, "completions/max_terminated_length": 487.0, "completions/mean_length": 337.625, "completions/mean_terminated_length": 337.625, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.9514251304696909, "frac_reward_zero_std": 1.0, "grad_norm": 0.02517926710522011, "kl": 0.00232696533203125, "learning_rate": 1.0655953083635508e-07, "loss": 0.0001, "num_tokens": 12643150.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 230.0, "completions/max_terminated_length": 230.0, "completions/mean_length": 184.375, "completions/mean_terminated_length": 184.375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9516927606048441, "frac_reward_zero_std": 1.0, "grad_norm": 0.01060350091326251, "kl": 0.0006198883056640625, "learning_rate": 1.0648821014674344e-07, "loss": 0.0, "num_tokens": 12645701.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 338.0, "completions/max_terminated_length": 338.0, "completions/mean_length": 284.625, "completions/mean_terminated_length": 284.625, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9519603907399973, "frac_reward_zero_std": 1.0, "grad_norm": 0.019214922035612474, "kl": 0.001544952392578125, "learning_rate": 1.0641727649294082e-07, "loss": 0.0001, "num_tokens": 12648962.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 343.5, "completions/mean_terminated_length": 343.5, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.9522280208751506, "frac_reward_zero_std": 1.0, "grad_norm": 0.0236994204165644, "kl": 0.00218963623046875, "learning_rate": 1.0634672993684827e-07, "loss": 0.0001, "num_tokens": 12653070.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 403.0, "completions/max_terminated_length": 403.0, "completions/mean_length": 336.5, "completions/mean_terminated_length": 336.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.9524956510103038, "frac_reward_zero_std": 1.0, "grad_norm": 0.014023187478910265, "kl": 0.001338958740234375, "learning_rate": 1.0627657054002917e-07, "loss": 0.0001, "num_tokens": 12656894.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 399.5, "completions/mean_terminated_length": 399.5, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.9527632811454569, "frac_reward_zero_std": 1.0, "grad_norm": 0.016708895425890206, "kl": 0.001255035400390625, "learning_rate": 1.0620679836370888e-07, "loss": 0.0001, "num_tokens": 12661194.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 254.0, "completions/max_terminated_length": 254.0, "completions/mean_length": 225.125, "completions/mean_terminated_length": 225.125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.9530309112806102, "frac_reward_zero_std": 1.0, "grad_norm": 0.04695608271626262, "kl": 0.002246856689453125, "learning_rate": 1.0613741346877496e-07, "loss": 0.0001, "num_tokens": 12663951.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 369.0, "completions/max_terminated_length": 369.0, "completions/mean_length": 260.875, "completions/mean_terminated_length": 260.875, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9532985414157634, "frac_reward_zero_std": 1.0, "grad_norm": 0.032487262376847625, "kl": 0.002193450927734375, "learning_rate": 1.060684159157769e-07, "loss": 0.0001, "num_tokens": 12667038.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 590.0, "completions/max_terminated_length": 590.0, "completions/mean_length": 389.25, "completions/mean_terminated_length": 389.25, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9535661715509166, "frac_reward_zero_std": 1.0, "grad_norm": 0.014861183144186612, "kl": 0.001804351806640625, "learning_rate": 1.0599980576492626e-07, "loss": 0.0001, "num_tokens": 12671232.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 278.625, "completions/mean_terminated_length": 278.625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9538338016860699, "frac_reward_zero_std": 1.0, "grad_norm": 0.019616037442067982, "kl": 0.00232696533203125, "learning_rate": 1.059315830760965e-07, "loss": 0.0001, "num_tokens": 12674645.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 368.875, "completions/mean_terminated_length": 368.875, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9541014318212231, "frac_reward_zero_std": 0.5, "grad_norm": 0.6340067342451428, "kl": 0.0022430419921875, "learning_rate": 1.0586374790882296e-07, "loss": 0.041, "num_tokens": 12678864.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 445.0, "completions/max_terminated_length": 445.0, "completions/mean_length": 312.875, "completions/mean_terminated_length": 312.875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9543690619563763, "frac_reward_zero_std": 0.5, "grad_norm": 0.9366828356831423, "kl": 0.001007080078125, "learning_rate": 1.0579630032230278e-07, "loss": 0.0285, "num_tokens": 12682491.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 270.375, "completions/mean_terminated_length": 270.375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.9546366920915295, "frac_reward_zero_std": 1.0, "grad_norm": 0.017727779796505205, "kl": 0.00189208984375, "learning_rate": 1.0572924037539495e-07, "loss": 0.0001, "num_tokens": 12685582.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 366.0, "completions/max_terminated_length": 366.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9549043222266828, "frac_reward_zero_std": 1.0, "grad_norm": 0.012521215162234044, "kl": 0.00124359130859375, "learning_rate": 1.0566256812662006e-07, "loss": 0.0, "num_tokens": 12688674.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 398.0, "completions/max_terminated_length": 398.0, "completions/mean_length": 248.25, "completions/mean_terminated_length": 248.25, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9551719523618359, "frac_reward_zero_std": 1.0, "grad_norm": 0.02012832629608411, "kl": 0.001312255859375, "learning_rate": 1.055962836341605e-07, "loss": 0.0001, "num_tokens": 12691736.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 303.0, "completions/max_terminated_length": 303.0, "completions/mean_length": 242.75, "completions/mean_terminated_length": 242.75, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9554395824969891, "frac_reward_zero_std": 1.0, "grad_norm": 0.016752496244036973, "kl": 0.001739501953125, "learning_rate": 1.0553038695586017e-07, "loss": 0.0001, "num_tokens": 12694738.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 381.0, "completions/max_terminated_length": 381.0, "completions/mean_length": 258.375, "completions/mean_terminated_length": 258.375, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.9557072126321424, "frac_reward_zero_std": 1.0, "grad_norm": 0.012903561784293366, "kl": 0.00130462646484375, "learning_rate": 1.0546487814922455e-07, "loss": 0.0001, "num_tokens": 12697677.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 247.0, "completions/max_terminated_length": 247.0, "completions/mean_length": 211.75, "completions/mean_terminated_length": 211.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.9559748427672956, "frac_reward_zero_std": 1.0, "grad_norm": 0.019990870351573757, "kl": 0.001056671142578125, "learning_rate": 1.0539975727142077e-07, "loss": 0.0, "num_tokens": 12700295.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 307.25, "completions/mean_terminated_length": 307.25, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.9562424729024488, "frac_reward_zero_std": 1.0, "grad_norm": 0.023792606718713787, "kl": 0.001262664794921875, "learning_rate": 1.0533502437927723e-07, "loss": 0.0001, "num_tokens": 12703841.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 341.125, "completions/mean_terminated_length": 341.125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.9565101030376021, "frac_reward_zero_std": 1.0, "grad_norm": 0.011846444550330716, "kl": 0.00099945068359375, "learning_rate": 1.0527067952928395e-07, "loss": 0.0, "num_tokens": 12707650.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 376.375, "completions/mean_terminated_length": 376.375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 0.9567777331727553, "frac_reward_zero_std": 1.0, "grad_norm": 0.02176981028898757, "kl": 0.0019989013671875, "learning_rate": 1.0520672277759221e-07, "loss": 0.0001, "num_tokens": 12711977.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.9570453633079085, "frac_reward_zero_std": 1.0, "grad_norm": 0.017804593732527667, "kl": 0.001728057861328125, "learning_rate": 1.0514315418001457e-07, "loss": 0.0001, "num_tokens": 12716006.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 364.625, "completions/mean_terminated_length": 364.625, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.9573129934430616, "frac_reward_zero_std": 1.0, "grad_norm": 0.018884885541962007, "kl": 0.001583099365234375, "learning_rate": 1.0507997379202497e-07, "loss": 0.0001, "num_tokens": 12720071.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 416.0, "completions/max_terminated_length": 416.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.9575806235782149, "frac_reward_zero_std": 1.0, "grad_norm": 0.012593481869448316, "kl": 0.000988006591796875, "learning_rate": 1.0501718166875844e-07, "loss": 0.0, "num_tokens": 12722913.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 331.125, "completions/mean_terminated_length": 331.125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9578482537133681, "frac_reward_zero_std": 1.0, "grad_norm": 0.014355300152499296, "kl": 0.000942230224609375, "learning_rate": 1.049547778650114e-07, "loss": 0.0, "num_tokens": 12726558.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 260.0, "completions/mean_terminated_length": 260.0, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9581158838485213, "frac_reward_zero_std": 1.0, "grad_norm": 0.03139073138545464, "kl": 0.002124786376953125, "learning_rate": 1.0489276243524118e-07, "loss": 0.0001, "num_tokens": 12729710.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 275.25, "completions/mean_terminated_length": 275.25, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.9583835139836746, "frac_reward_zero_std": 1.0, "grad_norm": 0.010370199474474743, "kl": 0.00103759765625, "learning_rate": 1.0483113543356628e-07, "loss": 0.0, "num_tokens": 12732824.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 466.0, "completions/max_terminated_length": 466.0, "completions/mean_length": 340.625, "completions/mean_terminated_length": 340.625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9586511441188278, "frac_reward_zero_std": 1.0, "grad_norm": 0.014064080426079074, "kl": 0.001247406005859375, "learning_rate": 1.0476989691376622e-07, "loss": 0.0, "num_tokens": 12736733.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 412.0, "completions/max_terminated_length": 412.0, "completions/mean_length": 346.0, "completions/mean_terminated_length": 346.0, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.958918774253981, "frac_reward_zero_std": 1.0, "grad_norm": 0.012173790529803509, "kl": 0.001155853271484375, "learning_rate": 1.0470904692928159e-07, "loss": 0.0, "num_tokens": 12740577.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 411.0, "completions/mean_terminated_length": 411.0, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.9591864043891342, "frac_reward_zero_std": 0.5, "grad_norm": 0.8574288331609231, "kl": 0.0011749267578125, "learning_rate": 1.046485855332138e-07, "loss": 0.0961, "num_tokens": 12745041.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 340.0, "completions/max_terminated_length": 340.0, "completions/mean_length": 250.125, "completions/mean_terminated_length": 250.125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.9594540345242875, "frac_reward_zero_std": 1.0, "grad_norm": 0.024143076533526094, "kl": 0.001800537109375, "learning_rate": 1.0458851277832518e-07, "loss": 0.0001, "num_tokens": 12748026.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 353.0, "completions/max_terminated_length": 353.0, "completions/mean_length": 284.5, "completions/mean_terminated_length": 284.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.9597216646594406, "frac_reward_zero_std": 1.0, "grad_norm": 0.013866994167380796, "kl": 0.001033782958984375, "learning_rate": 1.0452882871703898e-07, "loss": 0.0, "num_tokens": 12751286.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 219.0, "completions/max_terminated_length": 219.0, "completions/mean_length": 203.75, "completions/mean_terminated_length": 203.75, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9599892947945938, "frac_reward_zero_std": 1.0, "grad_norm": 0.03288750062442349, "kl": 0.001468658447265625, "learning_rate": 1.0446953340143918e-07, "loss": 0.0001, "num_tokens": 12753836.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 258.25, "completions/mean_terminated_length": 258.25, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.9602569249297471, "frac_reward_zero_std": 1.0, "grad_norm": 0.016471287974568573, "kl": 0.001415252685546875, "learning_rate": 1.0441062688327051e-07, "loss": 0.0001, "num_tokens": 12756914.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 706.0, "completions/max_terminated_length": 706.0, "completions/mean_length": 414.25, "completions/mean_terminated_length": 414.25, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9605245550649003, "frac_reward_zero_std": 0.5, "grad_norm": 0.5868840191352162, "kl": 0.002147674560546875, "learning_rate": 1.0435210921393853e-07, "loss": 0.0313, "num_tokens": 12761248.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 365.0, "completions/max_terminated_length": 365.0, "completions/mean_length": 261.25, "completions/mean_terminated_length": 261.25, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.9607921852000535, "frac_reward_zero_std": 1.0, "grad_norm": 0.01527438366386424, "kl": 0.001407623291015625, "learning_rate": 1.0429398044450929e-07, "loss": 0.0001, "num_tokens": 12764314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 451.0, "completions/max_terminated_length": 451.0, "completions/mean_length": 274.0, "completions/mean_terminated_length": 274.0, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9610598153352068, "frac_reward_zero_std": 1.0, "grad_norm": 0.01578638134442897, "kl": 0.00145721435546875, "learning_rate": 1.0423624062570953e-07, "loss": 0.0001, "num_tokens": 12767614.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 541.0, "completions/max_terminated_length": 541.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.96132744547036, "frac_reward_zero_std": 1.0, "grad_norm": 0.015053798388235954, "kl": 0.0010776519775390625, "learning_rate": 1.0417888980792663e-07, "loss": 0.0, "num_tokens": 12771473.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 234.5, "completions/mean_terminated_length": 234.5, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.9615950756055132, "frac_reward_zero_std": 1.0, "grad_norm": 0.02794630659198615, "kl": 0.00185394287109375, "learning_rate": 1.0412192804120851e-07, "loss": 0.0001, "num_tokens": 12774361.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 383.0, "completions/max_terminated_length": 383.0, "completions/mean_length": 247.75, "completions/mean_terminated_length": 247.75, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.9618627057406663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0433197878488009, "kl": 0.001911163330078125, "learning_rate": 1.0406535537526342e-07, "loss": 0.0001, "num_tokens": 12777307.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 415.0, "completions/max_terminated_length": 415.0, "completions/mean_length": 314.5, "completions/mean_terminated_length": 314.5, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9621303358758196, "frac_reward_zero_std": 1.0, "grad_norm": 0.01633291030649872, "kl": 0.0012111663818359375, "learning_rate": 1.0400917185946024e-07, "loss": 0.0, "num_tokens": 12780867.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 261.75, "completions/mean_terminated_length": 261.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.9623979660109728, "frac_reward_zero_std": 1.0, "grad_norm": 0.01353440373593136, "kl": 0.0012664794921875, "learning_rate": 1.0395337754282822e-07, "loss": 0.0001, "num_tokens": 12783929.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 280.625, "completions/mean_terminated_length": 280.625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.962665596146126, "frac_reward_zero_std": 1.0, "grad_norm": 0.011637716477962693, "kl": 0.000919342041015625, "learning_rate": 1.0389797247405678e-07, "loss": 0.0, "num_tokens": 12787106.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 526.0, "completions/max_terminated_length": 526.0, "completions/mean_length": 403.25, "completions/mean_terminated_length": 403.25, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9629332262812793, "frac_reward_zero_std": 1.0, "grad_norm": 0.013956138178982069, "kl": 0.00140380859375, "learning_rate": 1.0384295670149597e-07, "loss": 0.0001, "num_tokens": 12791476.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 418.0, "completions/max_terminated_length": 418.0, "completions/mean_length": 288.375, "completions/mean_terminated_length": 288.375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9632008564164325, "frac_reward_zero_std": 1.0, "grad_norm": 0.021872257920061793, "kl": 0.001560211181640625, "learning_rate": 1.0378833027315587e-07, "loss": 0.0001, "num_tokens": 12794859.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 504.0, "completions/max_terminated_length": 504.0, "completions/mean_length": 338.5, "completions/mean_terminated_length": 338.5, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.9634684865515857, "frac_reward_zero_std": 1.0, "grad_norm": 0.015213555992670514, "kl": 0.0012264251708984375, "learning_rate": 1.0373409323670688e-07, "loss": 0.0, "num_tokens": 12798519.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 337.875, "completions/mean_terminated_length": 337.875, "completions/min_length": 290.0, "completions/min_terminated_length": 290.0, "epoch": 0.963736116686739, "frac_reward_zero_std": 1.0, "grad_norm": 0.010694768726443838, "kl": 0.0007877349853515625, "learning_rate": 1.0368024563947958e-07, "loss": 0.0, "num_tokens": 12802274.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 333.25, "completions/mean_terminated_length": 333.25, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.9640037468218922, "frac_reward_zero_std": 1.0, "grad_norm": 0.0075464454309355465, "kl": 0.000720977783203125, "learning_rate": 1.0362678752846482e-07, "loss": 0.0, "num_tokens": 12805888.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 340.75, "completions/mean_terminated_length": 340.75, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.9642713769570453, "frac_reward_zero_std": 1.0, "grad_norm": 0.013496867469142105, "kl": 0.0012359619140625, "learning_rate": 1.0357371895031331e-07, "loss": 0.0, "num_tokens": 12809842.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 327.75, "completions/mean_terminated_length": 327.75, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "epoch": 0.9645390070921985, "frac_reward_zero_std": 1.0, "grad_norm": 0.010477134757276489, "kl": 0.000606536865234375, "learning_rate": 1.0352103995133607e-07, "loss": 0.0, "num_tokens": 12813440.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 379.75, "completions/mean_terminated_length": 379.75, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.9648066372273518, "frac_reward_zero_std": 1.0, "grad_norm": 0.018264319493832157, "kl": 0.0021820068359375, "learning_rate": 1.0346875057750394e-07, "loss": 0.0001, "num_tokens": 12817758.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 319.125, "completions/mean_terminated_length": 319.125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.965074267362505, "frac_reward_zero_std": 1.0, "grad_norm": 0.020794165991641307, "kl": 0.001247406005859375, "learning_rate": 1.0341685087444805e-07, "loss": 0.0001, "num_tokens": 12821471.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 459.125, "completions/mean_terminated_length": 459.125, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9653418974976582, "frac_reward_zero_std": 1.0, "grad_norm": 0.014609138357033742, "kl": 0.0009136199951171875, "learning_rate": 1.0336534088745908e-07, "loss": 0.0, "num_tokens": 12826296.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 432.0, "completions/max_terminated_length": 432.0, "completions/mean_length": 289.625, "completions/mean_terminated_length": 289.625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9656095276328115, "frac_reward_zero_std": 1.0, "grad_norm": 0.027521862147080613, "kl": 0.00146484375, "learning_rate": 1.0331422066148796e-07, "loss": 0.0001, "num_tokens": 12829649.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 282.0, "completions/mean_terminated_length": 282.0, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.9658771577679647, "frac_reward_zero_std": 1.0, "grad_norm": 0.010634935162847536, "kl": 0.0006923675537109375, "learning_rate": 1.0326349024114532e-07, "loss": 0.0, "num_tokens": 12832917.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 459.0, "completions/max_terminated_length": 459.0, "completions/mean_length": 364.375, "completions/mean_terminated_length": 364.375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9661447879031179, "frac_reward_zero_std": 1.0, "grad_norm": 0.028450162330590122, "kl": 0.00148773193359375, "learning_rate": 1.0321314967070165e-07, "loss": 0.0001, "num_tokens": 12837036.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 460.0, "completions/max_terminated_length": 460.0, "completions/mean_length": 321.125, "completions/mean_terminated_length": 321.125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.9664124180382712, "frac_reward_zero_std": 1.0, "grad_norm": 0.016038177692649595, "kl": 0.001552581787109375, "learning_rate": 1.0316319899408729e-07, "loss": 0.0001, "num_tokens": 12840537.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 252.75, "completions/mean_terminated_length": 252.75, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.9666800481734243, "frac_reward_zero_std": 1.0, "grad_norm": 0.017242633506778498, "kl": 0.00150299072265625, "learning_rate": 1.0311363825489225e-07, "loss": 0.0001, "num_tokens": 12843563.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 225.0, "completions/max_terminated_length": 225.0, "completions/mean_length": 201.125, "completions/mean_terminated_length": 201.125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.9669476783085775, "frac_reward_zero_std": 1.0, "grad_norm": 0.019593938267100958, "kl": 0.00110626220703125, "learning_rate": 1.0306446749636631e-07, "loss": 0.0, "num_tokens": 12846060.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 426.0, "completions/max_terminated_length": 426.0, "completions/mean_length": 333.375, "completions/mean_terminated_length": 333.375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.9672153084437307, "frac_reward_zero_std": 0.5, "grad_norm": 0.6468002919972271, "kl": 0.0009784698486328125, "learning_rate": 1.0301568676141891e-07, "loss": -0.012, "num_tokens": 12849835.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 320.0, "completions/max_terminated_length": 320.0, "completions/mean_length": 238.875, "completions/mean_terminated_length": 238.875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.967482938578884, "frac_reward_zero_std": 1.0, "grad_norm": 0.01759801457171332, "kl": 0.001239776611328125, "learning_rate": 1.0296729609261914e-07, "loss": 0.0, "num_tokens": 12852786.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 281.25, "completions/mean_terminated_length": 281.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9677505687140372, "frac_reward_zero_std": 1.0, "grad_norm": 0.029679310039336507, "kl": 0.002101898193359375, "learning_rate": 1.0291929553219565e-07, "loss": 0.0001, "num_tokens": 12856304.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 259.25, "completions/mean_terminated_length": 259.25, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9680181988491904, "frac_reward_zero_std": 1.0, "grad_norm": 0.014705196696006022, "kl": 0.0008563995361328125, "learning_rate": 1.0287168512203673e-07, "loss": 0.0, "num_tokens": 12859470.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 215.75, "completions/mean_terminated_length": 215.75, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9682858289843437, "frac_reward_zero_std": 1.0, "grad_norm": 0.019206369009593025, "kl": 0.00110626220703125, "learning_rate": 1.0282446490369016e-07, "loss": 0.0, "num_tokens": 12862076.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 436.0, "completions/max_terminated_length": 436.0, "completions/mean_length": 291.375, "completions/mean_terminated_length": 291.375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.9685534591194969, "frac_reward_zero_std": 1.0, "grad_norm": 0.021793119924252095, "kl": 0.00222015380859375, "learning_rate": 1.0277763491836329e-07, "loss": 0.0001, "num_tokens": 12865539.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 389.0, "completions/max_terminated_length": 389.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.96882108925465, "frac_reward_zero_std": 1.0, "grad_norm": 0.013910760547508763, "kl": 0.0007352828979492188, "learning_rate": 1.0273119520692274e-07, "loss": 0.0, "num_tokens": 12869235.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 259.875, "completions/mean_terminated_length": 259.875, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.9690887193898033, "frac_reward_zero_std": 1.0, "grad_norm": 0.010762154429128182, "kl": 0.0009708404541015625, "learning_rate": 1.0268514580989477e-07, "loss": 0.0, "num_tokens": 12872494.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 485.0, "completions/max_terminated_length": 485.0, "completions/mean_length": 372.75, "completions/mean_terminated_length": 372.75, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.9693563495249565, "frac_reward_zero_std": 1.0, "grad_norm": 0.014363162707439146, "kl": 0.0013885498046875, "learning_rate": 1.026394867674649e-07, "loss": 0.0001, "num_tokens": 12876676.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 361.0, "completions/max_terminated_length": 361.0, "completions/mean_length": 264.0, "completions/mean_terminated_length": 264.0, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.9696239796601097, "frac_reward_zero_std": 1.0, "grad_norm": 0.020514880466671, "kl": 0.001346588134765625, "learning_rate": 1.0259421811947799e-07, "loss": 0.0001, "num_tokens": 12879792.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 370.0, "completions/max_terminated_length": 370.0, "completions/mean_length": 301.375, "completions/mean_terminated_length": 301.375, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9698916097952629, "frac_reward_zero_std": 1.0, "grad_norm": 0.03304676177295779, "kl": 0.0016632080078125, "learning_rate": 1.025493399054383e-07, "loss": 0.0001, "num_tokens": 12883123.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 589.0, "completions/max_terminated_length": 589.0, "completions/mean_length": 369.875, "completions/mean_terminated_length": 369.875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9701592399304162, "frac_reward_zero_std": 1.0, "grad_norm": 0.011576777925390304, "kl": 0.001102447509765625, "learning_rate": 1.0250485216450932e-07, "loss": 0.0, "num_tokens": 12887202.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 275.0, "completions/max_terminated_length": 275.0, "completions/mean_length": 212.375, "completions/mean_terminated_length": 212.375, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.9704268700655694, "frac_reward_zero_std": 1.0, "grad_norm": 0.02440233663552773, "kl": 0.00185394287109375, "learning_rate": 1.0246075493551389e-07, "loss": 0.0001, "num_tokens": 12889957.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 258.875, "completions/mean_terminated_length": 258.875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9706945002007226, "frac_reward_zero_std": 1.0, "grad_norm": 0.07956961567717613, "kl": 0.002864837646484375, "learning_rate": 1.0241704825693384e-07, "loss": 0.0001, "num_tokens": 12893164.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 438.0, "completions/max_terminated_length": 438.0, "completions/mean_length": 336.375, "completions/mean_terminated_length": 336.375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.9709621303358759, "frac_reward_zero_std": 1.0, "grad_norm": 0.01583645612812169, "kl": 0.0014495849609375, "learning_rate": 1.023737321669104e-07, "loss": 0.0001, "num_tokens": 12897079.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 227.5, "completions/mean_terminated_length": 227.5, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.971229760471029, "frac_reward_zero_std": 1.0, "grad_norm": 0.01758837591102169, "kl": 0.001407623291015625, "learning_rate": 1.0233080670324383e-07, "loss": 0.0001, "num_tokens": 12899867.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 404.5, "completions/mean_terminated_length": 404.5, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.9714973906061822, "frac_reward_zero_std": 0.5, "grad_norm": 1.1304398660435167, "kl": 0.0022735595703125, "learning_rate": 1.0228827190339365e-07, "loss": 0.044, "num_tokens": 12904099.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 472.0, "completions/max_terminated_length": 472.0, "completions/mean_length": 362.125, "completions/mean_terminated_length": 362.125, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.9717650207413355, "frac_reward_zero_std": 1.0, "grad_norm": 0.01233651812409467, "kl": 0.0010223388671875, "learning_rate": 1.0224612780447818e-07, "loss": 0.0, "num_tokens": 12907956.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 324.25, "completions/mean_terminated_length": 324.25, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.9720326508764887, "frac_reward_zero_std": 1.0, "grad_norm": 0.011828869148588777, "kl": 0.0005846023559570312, "learning_rate": 1.0220437444327509e-07, "loss": 0.0, "num_tokens": 12911630.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 334.0, "completions/max_terminated_length": 334.0, "completions/mean_length": 281.125, "completions/mean_terminated_length": 281.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9723002810116419, "frac_reward_zero_std": 1.0, "grad_norm": 0.02444517941176222, "kl": 0.00162506103515625, "learning_rate": 1.0216301185622094e-07, "loss": 0.0001, "num_tokens": 12914863.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 430.0, "completions/max_terminated_length": 430.0, "completions/mean_length": 382.375, "completions/mean_terminated_length": 382.375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.9725679111467951, "frac_reward_zero_std": 1.0, "grad_norm": 0.01362768889977658, "kl": 0.0008449554443359375, "learning_rate": 1.0212204007941121e-07, "loss": 0.0, "num_tokens": 12919178.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9728355412819484, "frac_reward_zero_std": 1.0, "grad_norm": 0.009733476315273385, "kl": 0.0010986328125, "learning_rate": 1.0208145914860049e-07, "loss": 0.0, "num_tokens": 12923294.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 239.875, "completions/mean_terminated_length": 239.875, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.9731031714171016, "frac_reward_zero_std": 1.0, "grad_norm": 0.033420129562970656, "kl": 0.00194549560546875, "learning_rate": 1.0204126909920216e-07, "loss": 0.0001, "num_tokens": 12926437.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 322.375, "completions/mean_terminated_length": 322.375, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.9733708015522547, "frac_reward_zero_std": 1.0, "grad_norm": 0.017350884027011407, "kl": 0.001373291015625, "learning_rate": 1.0200146996628855e-07, "loss": 0.0001, "num_tokens": 12930248.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 330.0, "completions/max_terminated_length": 330.0, "completions/mean_length": 211.5, "completions/mean_terminated_length": 211.5, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.973638431687408, "frac_reward_zero_std": 0.5, "grad_norm": 0.8720722362868226, "kl": 0.0021514892578125, "learning_rate": 1.0196206178459085e-07, "loss": 0.008, "num_tokens": 12932872.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 376.0, "completions/max_terminated_length": 376.0, "completions/mean_length": 282.5, "completions/mean_terminated_length": 282.5, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9739060618225612, "frac_reward_zero_std": 1.0, "grad_norm": 0.012738757290985215, "kl": 0.000690460205078125, "learning_rate": 1.0192304458849906e-07, "loss": 0.0, "num_tokens": 12936160.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 345.0, "completions/max_terminated_length": 345.0, "completions/mean_length": 279.875, "completions/mean_terminated_length": 279.875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9741736919577144, "frac_reward_zero_std": 1.0, "grad_norm": 0.017045116089212154, "kl": 0.001461029052734375, "learning_rate": 1.0188441841206199e-07, "loss": 0.0001, "num_tokens": 12939407.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 442.75, "completions/mean_terminated_length": 442.75, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.9744413220928677, "frac_reward_zero_std": 1.0, "grad_norm": 0.008593060135826976, "kl": 0.0008563995361328125, "learning_rate": 1.0184618328898723e-07, "loss": 0.0, "num_tokens": 12944277.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 790.0, "completions/max_terminated_length": 790.0, "completions/mean_length": 434.5, "completions/mean_terminated_length": 434.5, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.9747089522280209, "frac_reward_zero_std": 1.0, "grad_norm": 0.03652200105852624, "kl": 0.00211334228515625, "learning_rate": 1.0180833925264123e-07, "loss": 0.0001, "num_tokens": 12948785.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 377.375, "completions/mean_terminated_length": 377.375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9749765823631741, "frac_reward_zero_std": 1.0, "grad_norm": 0.014718640372339175, "kl": 0.0007686614990234375, "learning_rate": 1.0177088633604885e-07, "loss": 0.0, "num_tokens": 12952816.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 342.0, "completions/max_terminated_length": 342.0, "completions/mean_length": 275.375, "completions/mean_terminated_length": 275.375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.9752442124983273, "frac_reward_zero_std": 1.0, "grad_norm": 0.011430369351299728, "kl": 0.0010013580322265625, "learning_rate": 1.0173382457189393e-07, "loss": 0.0, "num_tokens": 12956027.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 521.0, "completions/max_terminated_length": 521.0, "completions/mean_length": 350.75, "completions/mean_terminated_length": 350.75, "completions/min_length": 268.0, "completions/min_terminated_length": 268.0, "epoch": 0.9755118426334806, "frac_reward_zero_std": 1.0, "grad_norm": 0.016486450687959047, "kl": 0.00152587890625, "learning_rate": 1.0169715399251886e-07, "loss": 0.0001, "num_tokens": 12959925.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 395.0, "completions/max_terminated_length": 395.0, "completions/mean_length": 263.0, "completions/mean_terminated_length": 263.0, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.9757794727686337, "frac_reward_zero_std": 1.0, "grad_norm": 0.018347980305577775, "kl": 0.0018157958984375, "learning_rate": 1.0166087462992456e-07, "loss": 0.0001, "num_tokens": 12963161.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 267.25, "completions/mean_terminated_length": 267.25, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9760471029037869, "frac_reward_zero_std": 1.0, "grad_norm": 0.019051053047083696, "kl": 0.0021209716796875, "learning_rate": 1.0162498651577076e-07, "loss": 0.0001, "num_tokens": 12966319.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 0.9763147330389402, "frac_reward_zero_std": 0.5, "grad_norm": 0.9813968562808921, "kl": 0.001270294189453125, "learning_rate": 1.0158948968137563e-07, "loss": -0.0038, "num_tokens": 12970932.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 353.625, "completions/mean_terminated_length": 353.625, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9765823631740934, "frac_reward_zero_std": 1.0, "grad_norm": 0.017872582221912042, "kl": 0.001495361328125, "learning_rate": 1.0155438415771583e-07, "loss": 0.0001, "num_tokens": 12974837.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 397.0, "completions/max_terminated_length": 397.0, "completions/mean_length": 269.125, "completions/mean_terminated_length": 269.125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.9768499933092466, "frac_reward_zero_std": 1.0, "grad_norm": 0.03080170189132575, "kl": 0.00176239013671875, "learning_rate": 1.015196699754267e-07, "loss": 0.0001, "num_tokens": 12977858.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 275.125, "completions/mean_terminated_length": 275.125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.9771176234443998, "frac_reward_zero_std": 1.0, "grad_norm": 0.019370356786792293, "kl": 0.001506805419921875, "learning_rate": 1.0148534716480189e-07, "loss": 0.0001, "num_tokens": 12981051.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 286.0, "completions/max_terminated_length": 286.0, "completions/mean_length": 248.625, "completions/mean_terminated_length": 248.625, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.9773852535795531, "frac_reward_zero_std": 1.0, "grad_norm": 0.015916271157556283, "kl": 0.001331329345703125, "learning_rate": 1.0145141575579366e-07, "loss": 0.0001, "num_tokens": 12984068.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 390.0, "completions/max_terminated_length": 390.0, "completions/mean_length": 225.375, "completions/mean_terminated_length": 225.375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.9776528837147063, "frac_reward_zero_std": 1.0, "grad_norm": 0.030344569825129866, "kl": 0.00176239013671875, "learning_rate": 1.0141787577801265e-07, "loss": 0.0001, "num_tokens": 12986771.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 442.625, "completions/mean_terminated_length": 442.625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.9779205138498595, "frac_reward_zero_std": 1.0, "grad_norm": 0.021694199229459706, "kl": 0.0018310546875, "learning_rate": 1.0138472726072792e-07, "loss": 0.0001, "num_tokens": 12991580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 417.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 245.75, "completions/mean_terminated_length": 245.75, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9781881439850127, "frac_reward_zero_std": 1.0, "grad_norm": 0.025660450337759207, "kl": 0.00215911865234375, "learning_rate": 1.013519702328669e-07, "loss": 0.0001, "num_tokens": 12994458.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 320.25, "completions/mean_terminated_length": 320.25, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.9784557741201659, "frac_reward_zero_std": 0.5, "grad_norm": 0.9321651582987074, "kl": 0.0018157958984375, "learning_rate": 1.0131960472301545e-07, "loss": 0.1055, "num_tokens": 12998116.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 650.0, "completions/max_terminated_length": 650.0, "completions/mean_length": 416.375, "completions/mean_terminated_length": 416.375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9787234042553191, "frac_reward_zero_std": 0.5, "grad_norm": 0.8715875942122745, "kl": 0.00142669677734375, "learning_rate": 1.0128763075941764e-07, "loss": 0.0647, "num_tokens": 13002587.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 329.0, "completions/max_terminated_length": 329.0, "completions/mean_length": 273.625, "completions/mean_terminated_length": 273.625, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.9789910343904724, "frac_reward_zero_std": 1.0, "grad_norm": 0.013351811501378279, "kl": 0.0009450912475585938, "learning_rate": 1.0125604836997596e-07, "loss": 0.0, "num_tokens": 13005748.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 182.0, "completions/mean_terminated_length": 182.0, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.9792586645256256, "frac_reward_zero_std": 1.0, "grad_norm": 0.013376224698521148, "kl": 0.000850677490234375, "learning_rate": 1.012248575822512e-07, "loss": 0.0, "num_tokens": 13008068.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 524.0, "completions/max_terminated_length": 524.0, "completions/mean_length": 360.0, "completions/mean_terminated_length": 360.0, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.9795262946607788, "frac_reward_zero_std": 0.5, "grad_norm": 0.5751078915619177, "kl": 0.002597808837890625, "learning_rate": 1.0119405842346225e-07, "loss": 0.0412, "num_tokens": 13012216.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 513.0, "completions/max_terminated_length": 513.0, "completions/mean_length": 372.75, "completions/mean_terminated_length": 372.75, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.979793924795932, "frac_reward_zero_std": 1.0, "grad_norm": 0.046176996791963874, "kl": 0.002471923828125, "learning_rate": 1.0116365092048649e-07, "loss": 0.0001, "num_tokens": 13016358.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 480.0, "completions/max_terminated_length": 480.0, "completions/mean_length": 356.75, "completions/mean_terminated_length": 356.75, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.9800615549310853, "frac_reward_zero_std": 0.5, "grad_norm": 1.214886851589585, "kl": 0.001819610595703125, "learning_rate": 1.0113363509985933e-07, "loss": 0.0161, "num_tokens": 13020332.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 249.0, "completions/mean_terminated_length": 249.0, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.9803291850662385, "frac_reward_zero_std": 1.0, "grad_norm": 0.07274692090737685, "kl": 0.002201080322265625, "learning_rate": 1.0110401098777443e-07, "loss": 0.0001, "num_tokens": 13023580.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.0, "completions/max_terminated_length": 500.0, "completions/mean_length": 352.875, "completions/mean_terminated_length": 352.875, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.9805968152013916, "frac_reward_zero_std": 1.0, "grad_norm": 0.019441819002416205, "kl": 0.00118255615234375, "learning_rate": 1.0107477861008365e-07, "loss": 0.0, "num_tokens": 13027519.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 387.0, "completions/mean_terminated_length": 387.0, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "epoch": 0.9808644453365449, "frac_reward_zero_std": 1.0, "grad_norm": 0.014295755634559613, "kl": 0.00121307373046875, "learning_rate": 1.0104593799229694e-07, "loss": 0.0, "num_tokens": 13031731.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 252.0, "completions/max_terminated_length": 252.0, "completions/mean_length": 218.0, "completions/mean_terminated_length": 218.0, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "epoch": 0.9811320754716981, "frac_reward_zero_std": 1.0, "grad_norm": 0.027780668605741823, "kl": 0.001926422119140625, "learning_rate": 1.0101748915958241e-07, "loss": 0.0001, "num_tokens": 13034415.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 297.375, "completions/mean_terminated_length": 297.375, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.9813997056068513, "frac_reward_zero_std": 1.0, "grad_norm": 0.009932436057602087, "kl": 0.00079345703125, "learning_rate": 1.0098943213676621e-07, "loss": 0.0, "num_tokens": 13037718.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 363.0, "completions/max_terminated_length": 363.0, "completions/mean_length": 287.5, "completions/mean_terminated_length": 287.5, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.9816673357420046, "frac_reward_zero_std": 1.0, "grad_norm": 0.041099214152974246, "kl": 0.00345611572265625, "learning_rate": 1.0096176694833279e-07, "loss": 0.0001, "num_tokens": 13041026.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 274.0, "completions/max_terminated_length": 274.0, "completions/mean_length": 212.875, "completions/mean_terminated_length": 212.875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.9819349658771578, "frac_reward_zero_std": 1.0, "grad_norm": 0.02371310150591451, "kl": 0.0044708251953125, "learning_rate": 1.0093449361842436e-07, "loss": 0.0002, "num_tokens": 13043641.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 360.0, "completions/max_terminated_length": 360.0, "completions/mean_length": 282.25, "completions/mean_terminated_length": 282.25, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.982202596012311, "frac_reward_zero_std": 1.0, "grad_norm": 0.023432860023107936, "kl": 0.000873565673828125, "learning_rate": 1.0090761217084137e-07, "loss": 0.0, "num_tokens": 13046983.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 425.0, "completions/max_terminated_length": 425.0, "completions/mean_length": 315.875, "completions/mean_terminated_length": 315.875, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.9824702261474642, "frac_reward_zero_std": 0.5, "grad_norm": 0.6540923069973754, "kl": 0.00125885009765625, "learning_rate": 1.0088112262904225e-07, "loss": 0.0164, "num_tokens": 13050486.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 458.0, "completions/mean_terminated_length": 458.0, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.9827378562826175, "frac_reward_zero_std": 1.0, "grad_norm": 0.00709672224917763, "kl": 0.0005245208740234375, "learning_rate": 1.0085502501614336e-07, "loss": 0.0, "num_tokens": 13055178.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 336.0, "completions/max_terminated_length": 336.0, "completions/mean_length": 287.75, "completions/mean_terminated_length": 287.75, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.9830054864177706, "frac_reward_zero_std": 1.0, "grad_norm": 0.02234980795170212, "kl": 0.00173187255859375, "learning_rate": 1.0082931935491914e-07, "loss": 0.0001, "num_tokens": 13058560.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 430.125, "completions/mean_terminated_length": 430.125, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.9832731165529238, "frac_reward_zero_std": 1.0, "grad_norm": 0.014753040628951343, "kl": 0.00124359130859375, "learning_rate": 1.0080400566780195e-07, "loss": 0.0, "num_tokens": 13063145.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 427.0, "completions/max_terminated_length": 427.0, "completions/mean_length": 306.375, "completions/mean_terminated_length": 306.375, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.9835407466880771, "frac_reward_zero_std": 1.0, "grad_norm": 0.040830965779854525, "kl": 0.0019989013671875, "learning_rate": 1.00779083976882e-07, "loss": 0.0001, "num_tokens": 13066984.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 625.0, "completions/max_terminated_length": 625.0, "completions/mean_length": 444.625, "completions/mean_terminated_length": 444.625, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9838083768232303, "frac_reward_zero_std": 1.0, "grad_norm": 0.09468272722037516, "kl": 0.003322601318359375, "learning_rate": 1.0075455430390757e-07, "loss": 0.0001, "num_tokens": 13071737.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 536.0, "completions/max_terminated_length": 536.0, "completions/mean_length": 364.0, "completions/mean_terminated_length": 364.0, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9840760069583835, "frac_reward_zero_std": 1.0, "grad_norm": 0.013951629649132588, "kl": 0.001224517822265625, "learning_rate": 1.0073041667028472e-07, "loss": 0.0, "num_tokens": 13075673.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 352.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 296.75, "completions/mean_terminated_length": 296.75, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.9843436370935368, "frac_reward_zero_std": 0.5, "grad_norm": 0.6663863006355586, "kl": 0.00156402587890625, "learning_rate": 1.007066710970775e-07, "loss": -0.0195, "num_tokens": 13079099.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 471.0, "completions/max_terminated_length": 471.0, "completions/mean_length": 337.375, "completions/mean_terminated_length": 337.375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.98461126722869, "frac_reward_zero_std": 1.0, "grad_norm": 0.0644390844086002, "kl": 0.00203704833984375, "learning_rate": 1.0068331760500772e-07, "loss": 0.0001, "num_tokens": 13082970.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 319.0, "completions/max_terminated_length": 319.0, "completions/mean_length": 280.125, "completions/mean_terminated_length": 280.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9848788973638432, "frac_reward_zero_std": 1.0, "grad_norm": 0.020004237228377224, "kl": 0.0014286041259765625, "learning_rate": 1.0066035621445507e-07, "loss": 0.0001, "num_tokens": 13086271.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 257.5, "completions/mean_terminated_length": 257.5, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "epoch": 0.9851465274989963, "frac_reward_zero_std": 1.0, "grad_norm": 0.040148389119040784, "kl": 0.001800537109375, "learning_rate": 1.0063778694545713e-07, "loss": 0.0001, "num_tokens": 13089363.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 274.625, "completions/mean_terminated_length": 274.625, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.9854141576341496, "frac_reward_zero_std": 1.0, "grad_norm": 0.018519838767413314, "kl": 0.001430511474609375, "learning_rate": 1.0061560981770921e-07, "loss": 0.0001, "num_tokens": 13092504.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 288.875, "completions/mean_terminated_length": 288.875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9856817877693028, "frac_reward_zero_std": 0.5, "grad_norm": 0.8646066128031658, "kl": 0.00191497802734375, "learning_rate": 1.0059382485056444e-07, "loss": -0.0632, "num_tokens": 13095807.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.985949417904456, "frac_reward_zero_std": 1.0, "grad_norm": 0.015403080059939196, "kl": 0.000766754150390625, "learning_rate": 1.0057243206303376e-07, "loss": 0.0, "num_tokens": 13099053.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 258.625, "completions/mean_terminated_length": 258.625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.9862170480396093, "frac_reward_zero_std": 1.0, "grad_norm": 0.009437814387424559, "kl": 0.0005779266357421875, "learning_rate": 1.0055143147378576e-07, "loss": 0.0, "num_tokens": 13102238.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 386.375, "completions/mean_terminated_length": 386.375, "completions/min_length": 279.0, "completions/min_terminated_length": 279.0, "epoch": 0.9864846781747625, "frac_reward_zero_std": 1.0, "grad_norm": 0.014009309086545947, "kl": 0.00141143798828125, "learning_rate": 1.0053082310114691e-07, "loss": 0.0001, "num_tokens": 13106605.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 444.0, "completions/max_terminated_length": 444.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.9867523083099157, "frac_reward_zero_std": 1.0, "grad_norm": 0.011819652914487435, "kl": 0.001125335693359375, "learning_rate": 1.0051060696310138e-07, "loss": 0.0, "num_tokens": 13109978.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 282.0, "completions/max_terminated_length": 282.0, "completions/mean_length": 243.375, "completions/mean_terminated_length": 243.375, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.987019938445069, "frac_reward_zero_std": 1.0, "grad_norm": 0.017175844120226883, "kl": 0.001201629638671875, "learning_rate": 1.0049078307729103e-07, "loss": 0.0, "num_tokens": 13112877.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 315.0, "completions/max_terminated_length": 315.0, "completions/mean_length": 262.5, "completions/mean_terminated_length": 262.5, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.9872875685802222, "frac_reward_zero_std": 1.0, "grad_norm": 0.015929265492965836, "kl": 0.000911712646484375, "learning_rate": 1.0047135146101527e-07, "loss": 0.0, "num_tokens": 13115889.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 294.0, "completions/max_terminated_length": 294.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9875551987153753, "frac_reward_zero_std": 1.0, "grad_norm": 0.012303453232097646, "kl": 0.0007038116455078125, "learning_rate": 1.0045231213123147e-07, "loss": 0.0, "num_tokens": 13118558.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 422.0, "completions/max_terminated_length": 422.0, "completions/mean_length": 305.125, "completions/mean_terminated_length": 305.125, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.9878228288505285, "frac_reward_zero_std": 1.0, "grad_norm": 0.01004052306552324, "kl": 0.00067138671875, "learning_rate": 1.0043366510455448e-07, "loss": 0.0, "num_tokens": 13121951.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 514.0, "completions/max_terminated_length": 514.0, "completions/mean_length": 377.25, "completions/mean_terminated_length": 377.25, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.9880904589856818, "frac_reward_zero_std": 0.0, "grad_norm": 1.228550774423236, "kl": 0.0030059814453125, "learning_rate": 1.0041541039725685e-07, "loss": -0.0305, "num_tokens": 13126525.0, "reward": 0.625, "reward_std": 0.5386751294136047, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 306.75, "completions/mean_terminated_length": 306.75, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.988358089120835, "frac_reward_zero_std": 1.0, "grad_norm": 0.01275373121844254, "kl": 0.001361846923828125, "learning_rate": 1.0039754802526882e-07, "loss": 0.0001, "num_tokens": 13130055.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 314.0, "completions/max_terminated_length": 314.0, "completions/mean_length": 267.875, "completions/mean_terminated_length": 267.875, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.9886257192559882, "frac_reward_zero_std": 1.0, "grad_norm": 0.012004142194558183, "kl": 0.0009632110595703125, "learning_rate": 1.0038007800417805e-07, "loss": 0.0, "num_tokens": 13133174.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 399.0, "completions/max_terminated_length": 399.0, "completions/mean_length": 341.625, "completions/mean_terminated_length": 341.625, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.9888933493911415, "frac_reward_zero_std": 1.0, "grad_norm": 0.01630576983067528, "kl": 0.001529693603515625, "learning_rate": 1.0036300034923012e-07, "loss": 0.0001, "num_tokens": 13136971.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 448.0, "completions/max_terminated_length": 448.0, "completions/mean_length": 312.0, "completions/mean_terminated_length": 312.0, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.9891609795262947, "frac_reward_zero_std": 1.0, "grad_norm": 0.013071725456709468, "kl": 0.00102996826171875, "learning_rate": 1.00346315075328e-07, "loss": 0.0, "num_tokens": 13140599.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 421.0, "completions/max_terminated_length": 421.0, "completions/mean_length": 304.75, "completions/mean_terminated_length": 304.75, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.9894286096614479, "frac_reward_zero_std": 1.0, "grad_norm": 0.013357003770743262, "kl": 0.001247406005859375, "learning_rate": 1.0033002219703222e-07, "loss": 0.0, "num_tokens": 13144173.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 292.0, "completions/mean_terminated_length": 292.0, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9896962397966012, "frac_reward_zero_std": 1.0, "grad_norm": 0.019638292026083245, "kl": 0.001178741455078125, "learning_rate": 1.0031412172856105e-07, "loss": 0.0, "num_tokens": 13148077.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 327.5, "completions/mean_terminated_length": 327.5, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9899638699317543, "frac_reward_zero_std": 1.0, "grad_norm": 0.009470795757976502, "kl": 0.001041412353515625, "learning_rate": 1.0029861368379019e-07, "loss": 0.0, "num_tokens": 13151633.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 434.0, "completions/max_terminated_length": 434.0, "completions/mean_length": 277.875, "completions/mean_terminated_length": 277.875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.9902315000669075, "frac_reward_zero_std": 1.0, "grad_norm": 0.03616720846476464, "kl": 0.001323699951171875, "learning_rate": 1.002834980762529e-07, "loss": 0.0001, "num_tokens": 13154804.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 300.0, "completions/max_terminated_length": 300.0, "completions/mean_length": 221.75, "completions/mean_terminated_length": 221.75, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.9904991302020607, "frac_reward_zero_std": 1.0, "grad_norm": 0.1496566396372997, "kl": 0.005889892578125, "learning_rate": 1.0026877491913996e-07, "loss": 0.0002, "num_tokens": 13157850.0, "reward": 0.5, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5345224738121033, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 386.5, "completions/mean_terminated_length": 386.5, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.990766760337214, "frac_reward_zero_std": 0.5, "grad_norm": 0.8793274032710979, "kl": 0.00142669677734375, "learning_rate": 1.0025444422529982e-07, "loss": 0.0283, "num_tokens": 13162014.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 325.0, "completions/max_terminated_length": 325.0, "completions/mean_length": 243.75, "completions/mean_terminated_length": 243.75, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.9910343904723672, "frac_reward_zero_std": 1.0, "grad_norm": 0.008286421862520023, "kl": 0.0003871917724609375, "learning_rate": 1.0024050600723824e-07, "loss": 0.0, "num_tokens": 13164900.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 538.0, "completions/max_terminated_length": 538.0, "completions/mean_length": 338.75, "completions/mean_terminated_length": 338.75, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.9913020206075204, "frac_reward_zero_std": 1.0, "grad_norm": 0.01930718076571679, "kl": 0.001495361328125, "learning_rate": 1.0022696027711854e-07, "loss": 0.0001, "num_tokens": 13168642.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 469.0, "completions/max_terminated_length": 469.0, "completions/mean_length": 269.5, "completions/mean_terminated_length": 269.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9915696507426737, "frac_reward_zero_std": 1.0, "grad_norm": 0.018283811891893865, "kl": 0.0011749267578125, "learning_rate": 1.0021380704676166e-07, "loss": 0.0, "num_tokens": 13171810.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 281.0, "completions/max_terminated_length": 281.0, "completions/mean_length": 253.375, "completions/mean_terminated_length": 253.375, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.9918372808778269, "frac_reward_zero_std": 1.0, "grad_norm": 0.014131879788837497, "kl": 0.0010852813720703125, "learning_rate": 1.0020104632764589e-07, "loss": 0.0, "num_tokens": 13174801.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 517.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 326.0, "completions/mean_terminated_length": 326.0, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.99210491101298, "frac_reward_zero_std": 1.0, "grad_norm": 0.023623758267010957, "kl": 0.0016937255859375, "learning_rate": 1.0018867813090696e-07, "loss": 0.0001, "num_tokens": 13178385.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 238.125, "completions/mean_terminated_length": 238.125, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.9923725411481332, "frac_reward_zero_std": 1.0, "grad_norm": 0.01576801666362656, "kl": 0.0013885498046875, "learning_rate": 1.001767024673382e-07, "loss": 0.0001, "num_tokens": 13181314.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 271.875, "completions/mean_terminated_length": 271.875, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "epoch": 0.9926401712832865, "frac_reward_zero_std": 0.5, "grad_norm": 0.8788399664380583, "kl": 0.0017242431640625, "learning_rate": 1.001651193473903e-07, "loss": 0.0051, "num_tokens": 13184761.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 410.0, "completions/max_terminated_length": 410.0, "completions/mean_length": 342.875, "completions/mean_terminated_length": 342.875, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.9929078014184397, "frac_reward_zero_std": 1.0, "grad_norm": 0.019677836533859648, "kl": 0.00171661376953125, "learning_rate": 1.0015392878117137e-07, "loss": 0.0001, "num_tokens": 13188656.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 307.0, "completions/max_terminated_length": 307.0, "completions/mean_length": 229.125, "completions/mean_terminated_length": 229.125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.9931754315535929, "frac_reward_zero_std": 1.0, "grad_norm": 0.0241970884810798, "kl": 0.001605987548828125, "learning_rate": 1.00143130778447e-07, "loss": 0.0001, "num_tokens": 13191561.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 490.0, "completions/max_terminated_length": 490.0, "completions/mean_length": 422.625, "completions/mean_terminated_length": 422.625, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 0.9934430616887462, "frac_reward_zero_std": 1.0, "grad_norm": 0.019781312746338525, "kl": 0.001613616943359375, "learning_rate": 1.0013272534864027e-07, "loss": 0.0001, "num_tokens": 13196166.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 327.0, "completions/max_terminated_length": 327.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9937106918238994, "frac_reward_zero_std": 1.0, "grad_norm": 0.018965181253634812, "kl": 0.001514434814453125, "learning_rate": 1.0012271250083149e-07, "loss": 0.0001, "num_tokens": 13199367.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 339.875, "completions/mean_terminated_length": 339.875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.9939783219590526, "frac_reward_zero_std": 1.0, "grad_norm": 0.01006411339912601, "kl": 0.001186370849609375, "learning_rate": 1.0011309224375853e-07, "loss": 0.0, "num_tokens": 13203006.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 409.0, "completions/max_terminated_length": 409.0, "completions/mean_length": 330.375, "completions/mean_terminated_length": 330.375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.9942459520942059, "frac_reward_zero_std": 1.0, "grad_norm": 0.026121043013427693, "kl": 0.00249481201171875, "learning_rate": 1.0010386458581667e-07, "loss": 0.0001, "num_tokens": 13206769.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 412.125, "completions/mean_terminated_length": 412.125, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.994513582229359, "frac_reward_zero_std": 0.5, "grad_norm": 1.0001016383863424, "kl": 0.003875732421875, "learning_rate": 1.0009502953505843e-07, "loss": -0.0014, "num_tokens": 13211326.0, "reward": 0.625, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.5175492167472839, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 414.0, "completions/max_terminated_length": 414.0, "completions/mean_length": 328.5, "completions/mean_terminated_length": 328.5, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.9947812123645122, "frac_reward_zero_std": 1.0, "grad_norm": 0.014645844379138257, "kl": 0.00146484375, "learning_rate": 1.0008658709919394e-07, "loss": 0.0001, "num_tokens": 13215030.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 298.0, "completions/max_terminated_length": 298.0, "completions/mean_length": 252.125, "completions/mean_terminated_length": 252.125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.9950488424996654, "frac_reward_zero_std": 1.0, "grad_norm": 0.01715383773240175, "kl": 0.001270294189453125, "learning_rate": 1.0007853728559053e-07, "loss": 0.0001, "num_tokens": 13217979.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 290.375, "completions/mean_terminated_length": 290.375, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.9953164726348187, "frac_reward_zero_std": 1.0, "grad_norm": 0.0254300694181836, "kl": 0.002285003662109375, "learning_rate": 1.0007088010127298e-07, "loss": 0.0001, "num_tokens": 13221366.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 423.0, "completions/max_terminated_length": 423.0, "completions/mean_length": 337.5, "completions/mean_terminated_length": 337.5, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.9955841027699719, "frac_reward_zero_std": 0.5, "grad_norm": 0.8213854015054332, "kl": 0.0087890625, "learning_rate": 1.0006361555292336e-07, "loss": -0.0829, "num_tokens": 13225434.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 385.5, "completions/mean_terminated_length": 385.5, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.9958517329051251, "frac_reward_zero_std": 1.0, "grad_norm": 0.01947038004780938, "kl": 0.001575469970703125, "learning_rate": 1.0005674364688125e-07, "loss": 0.0001, "num_tokens": 13229778.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 327.125, "completions/mean_terminated_length": 327.125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.9961193630402784, "frac_reward_zero_std": 0.5, "grad_norm": 0.8477382355987346, "kl": 0.0052642822265625, "learning_rate": 1.0005026438914351e-07, "loss": -0.0901, "num_tokens": 13233571.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 437.0, "completions/max_terminated_length": 437.0, "completions/mean_length": 359.0, "completions/mean_terminated_length": 359.0, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.9963869931754316, "frac_reward_zero_std": 1.0, "grad_norm": 0.01403111646924382, "kl": 0.00127410888671875, "learning_rate": 1.0004417778536422e-07, "loss": 0.0001, "num_tokens": 13237715.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 384.0, "completions/max_terminated_length": 384.0, "completions/mean_length": 286.625, "completions/mean_terminated_length": 286.625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.9966546233105847, "frac_reward_zero_std": 1.0, "grad_norm": 0.016114966097272747, "kl": 0.0008544921875, "learning_rate": 1.0003848384085509e-07, "loss": 0.0, "num_tokens": 13241236.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 479.0, "completions/max_terminated_length": 479.0, "completions/mean_length": 332.0, "completions/mean_terminated_length": 332.0, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "epoch": 0.996922253445738, "frac_reward_zero_std": 1.0, "grad_norm": 0.012140953701234705, "kl": 0.0011997222900390625, "learning_rate": 1.0003318256058486e-07, "loss": 0.0, "num_tokens": 13244972.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 383.625, "completions/mean_terminated_length": 383.625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.9971898835808912, "frac_reward_zero_std": 0.5, "grad_norm": 0.815169837270602, "kl": 0.002197265625, "learning_rate": 1.0002827394917986e-07, "loss": -0.0293, "num_tokens": 13249277.0, "reward": 0.75, "reward_std": 0.28867512941360474, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4629100561141968, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 408.0, "completions/max_terminated_length": 408.0, "completions/mean_length": 304.875, "completions/mean_terminated_length": 304.875, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.9974575137160444, "frac_reward_zero_std": 1.0, "grad_norm": 0.024230095657363348, "kl": 0.0012149810791015625, "learning_rate": 1.0002375801092358e-07, "loss": 0.0, "num_tokens": 13252752.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 310.0, "completions/mean_terminated_length": 310.0, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "epoch": 0.9977251438511976, "frac_reward_zero_std": 0.5, "grad_norm": 0.6588765482022654, "kl": 0.0006561279296875, "learning_rate": 1.0001963474975698e-07, "loss": 0.0216, "num_tokens": 13256312.0, "reward": 0.875, "reward_std": 0.25, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.3535533845424652, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 394.125, "completions/mean_terminated_length": 394.125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.9979927739863509, "frac_reward_zero_std": 1.0, "grad_norm": 0.0124533488431293, "kl": 0.0013580322265625, "learning_rate": 1.0001590416927821e-07, "loss": 0.0001, "num_tokens": 13260733.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 379.0, "completions/max_terminated_length": 379.0, "completions/mean_length": 281.75, "completions/mean_terminated_length": 281.75, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.9982604041215041, "frac_reward_zero_std": 1.0, "grad_norm": 0.012493527723628113, "kl": 0.00128936767578125, "learning_rate": 1.0001256627274278e-07, "loss": 0.0001, "num_tokens": 13263975.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 348.0, "completions/max_terminated_length": 348.0, "completions/mean_length": 273.75, "completions/mean_terminated_length": 273.75, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9985280342566573, "frac_reward_zero_std": 1.0, "grad_norm": 0.017303680421583274, "kl": 0.001567840576171875, "learning_rate": 1.0000962106306358e-07, "loss": 0.0001, "num_tokens": 13267373.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 322.0, "completions/max_terminated_length": 322.0, "completions/mean_length": 267.5, "completions/mean_terminated_length": 267.5, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9987956643918106, "frac_reward_zero_std": 1.0, "grad_norm": 0.02480907681994327, "kl": 0.001789093017578125, "learning_rate": 1.0000706854281089e-07, "loss": 0.0001, "num_tokens": 13270577.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 230.0, "completions/mean_terminated_length": 230.0, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9990632945269637, "frac_reward_zero_std": 1.0, "grad_norm": 0.017107131523161994, "kl": 0.0012416839599609375, "learning_rate": 1.0000490871421201e-07, "loss": 0.0, "num_tokens": 13273381.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 454.0, "completions/max_terminated_length": 454.0, "completions/mean_length": 312.75, "completions/mean_terminated_length": 312.75, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.9993309246621169, "frac_reward_zero_std": 1.0, "grad_norm": 0.01602074467703852, "kl": 0.001434326171875, "learning_rate": 1.0000314157915182e-07, "loss": 0.0001, "num_tokens": 13277115.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 229.5, "completions/mean_terminated_length": 229.5, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.9995985547972702, "frac_reward_zero_std": 1.0, "grad_norm": 0.0131153545536918, "kl": 0.0008525848388671875, "learning_rate": 1.0000176713917249e-07, "loss": 0.0, "num_tokens": 13279887.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 454.5, "completions/mean_terminated_length": 454.5, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.9998661849324234, "frac_reward_zero_std": 1.0, "grad_norm": 0.015055653539006935, "kl": 0.00090789794921875, "learning_rate": 1.0000078539547335e-07, "loss": 0.0, "num_tokens": 13284615.0, "reward": 1.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "step": 3736 }, { "epoch": 0.9998661849324234, "step": 3736, "total_flos": 0.0, "train_loss": 0.0019137199302173646, "train_runtime": 25690.8059, "train_samples_per_second": 0.291, "train_steps_per_second": 0.145 } ], "logging_steps": 1, "max_steps": 3737, "num_input_tokens_seen": 13284615, "num_train_epochs": 1, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }