diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,15043 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 35.689655172413794, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3608.0, + "completions/max_terminated_length": 3008.0, + "completions/mean_length": 836.2421875, + "completions/mean_terminated_length": 763.040283203125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 0.06896551724137931, + "frac_reward_zero_std": 0.875, + "grad_norm": 7.329275313378857, + "kl": 0.0007262229919433594, + "learning_rate": 0.0, + "loss": -0.0208, + "num_tokens": 237015.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 1679.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 738.0234375, + "completions/mean_terminated_length": 690.4166870117188, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 0.13793103448275862, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.03884243258483543, + "kl": 0.0013523101806640625, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0, + "num_tokens": 462554.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 3194.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 859.375, + "completions/mean_terminated_length": 734.9915161132812, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.20689655172413793, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2713581262227546, + "kl": 0.0028324127197265625, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0134, + "num_tokens": 703626.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 3311.0, + "completions/max_terminated_length": 2429.0, + "completions/mean_length": 829.578125, + "completions/mean_terminated_length": 674.7344970703125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 0.27586206896551724, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.29708280749661614, + "kl": 0.0035877227783203125, + "learning_rate": 1.0000000000000002e-06, + "loss": -0.0728, + "num_tokens": 940884.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 2788.0, + "completions/max_terminated_length": 2788.0, + "completions/mean_length": 816.3515625, + "completions/mean_terminated_length": 765.6854858398438, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.3448275862068966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.06949683850604436, + "kl": 0.001728057861328125, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0, + "num_tokens": 1176009.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 3751.0, + "completions/max_terminated_length": 2844.0, + "completions/mean_length": 866.28125, + "completions/mean_terminated_length": 833.5556030273438, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 0.41379310344827586, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.1143311609167342, + "kl": 0.0008478164672851562, + "learning_rate": 1.6666666666666667e-06, + "loss": -0.0167, + "num_tokens": 1415893.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 3055.0, + "completions/max_terminated_length": 3055.0, + "completions/mean_length": 805.8359375, + "completions/mean_terminated_length": 770.1349487304688, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 0.4827586206896552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.07492278282994858, + "kl": 0.0012269020080566406, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0, + "num_tokens": 1650112.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 2050.0, + "completions/mean_length": 723.9296875, + "completions/mean_terminated_length": 688.2177124023438, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.5517241379310345, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04363979198005147, + "kl": 0.0010972023010253906, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.0, + "num_tokens": 1872679.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3639.0, + "completions/max_terminated_length": 2242.0, + "completions/mean_length": 739.3828125, + "completions/mean_terminated_length": 699.8225708007812, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.6206896551724138, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.11751018346403032, + "kl": 0.0007791519165039062, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0139, + "num_tokens": 2098392.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2701.0, + "completions/max_terminated_length": 2701.0, + "completions/mean_length": 843.828125, + "completions/mean_terminated_length": 818.0400390625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 0.6896551724137931, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.282798190145457, + "kl": 0.009571075439453125, + "learning_rate": 3e-06, + "loss": 0.0048, + "num_tokens": 2337474.0, + "reward": 0.0062500000931322575, + "reward_std": 0.01462521031498909, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 3478.0, + "completions/max_terminated_length": 2911.0, + "completions/mean_length": 788.875, + "completions/mean_terminated_length": 751.2222900390625, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 0.7586206896551724, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.2217463855001291, + "kl": 0.0009412765502929688, + "learning_rate": 3.3333333333333333e-06, + "loss": -0.0439, + "num_tokens": 2568146.0, + "reward": 0.0062500000931322575, + "reward_std": 0.01462521031498909, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 3367.0, + "completions/max_terminated_length": 3367.0, + "completions/mean_length": 872.359375, + "completions/mean_terminated_length": 821.3901977539062, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 0.8275862068965517, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.9919377838221575, + "kl": 0.0614471435546875, + "learning_rate": 3.6666666666666666e-06, + "loss": -0.0038, + "num_tokens": 2810880.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 3649.0, + "completions/max_terminated_length": 3049.0, + "completions/mean_length": 799.0390625, + "completions/mean_terminated_length": 745.2000122070312, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 0.896551724137931, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8404371378196254, + "kl": 0.021940231323242188, + "learning_rate": 4.000000000000001e-06, + "loss": -0.0194, + "num_tokens": 3044229.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3196.0, + "completions/max_terminated_length": 2418.0, + "completions/mean_length": 820.6953125, + "completions/mean_terminated_length": 801.9921264648438, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.9655172413793104, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.29176753147358486, + "kl": 0.004749298095703125, + "learning_rate": 4.333333333333334e-06, + "loss": -0.0113, + "num_tokens": 3280350.0, + "reward": 0.0078125, + "reward_std": 0.019044626504182816, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3712.0, + "completions/max_terminated_length": 2837.0, + "completions/mean_length": 984.875, + "completions/mean_terminated_length": 850.49169921875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 1.0689655172413792, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.6613227639845897, + "kl": 0.13519287109375, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0119, + "num_tokens": 3536582.0, + "reward": 0.02031249925494194, + "reward_std": 0.047098226845264435, + "rewards/code_format_reward/mean": 0.09375, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 3213.0, + "completions/max_terminated_length": 3158.0, + "completions/mean_length": 1020.6484375, + "completions/mean_terminated_length": 842.5438842773438, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 1.1379310344827587, + "frac_reward_zero_std": 0.125, + "grad_norm": 43.83469868815612, + "kl": 1.2880859375, + "learning_rate": 5e-06, + "loss": 0.0709, + "num_tokens": 3798297.0, + "reward": 0.03125, + "reward_std": 0.06938961893320084, + "rewards/code_format_reward/mean": 0.15625, + "rewards/code_format_reward/std": 0.3645188808441162, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.3645188808441162, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3634.0, + "completions/max_terminated_length": 2728.0, + "completions/mean_length": 816.6875, + "completions/mean_terminated_length": 726.9193115234375, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 1.206896551724138, + "frac_reward_zero_std": 0.1875, + "grad_norm": 995.2557894224733, + "kl": 23.7371826171875, + "learning_rate": 4.999952797253148e-06, + "loss": 0.277, + "num_tokens": 4033905.0, + "reward": 0.03671874850988388, + "reward_std": 0.06861686706542969, + "rewards/code_format_reward/mean": 0.1796875, + "rewards/code_format_reward/std": 0.3854354918003082, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.39184603095054626, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3677.0, + "completions/max_terminated_length": 3677.0, + "completions/mean_length": 828.2109375, + "completions/mean_terminated_length": 792.1817626953125, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.2758620689655173, + "frac_reward_zero_std": 0.125, + "grad_norm": 15620.298455105083, + "kl": 711.0061645507812, + "learning_rate": 4.9998111909931225e-06, + "loss": 7.2128, + "num_tokens": 4270988.0, + "reward": 0.05546875298023224, + "reward_std": 0.08068342506885529, + "rewards/code_format_reward/mean": 0.2734375, + "rewards/code_format_reward/std": 0.447474867105484, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4513758420944214, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3792.0, + "completions/max_terminated_length": 3792.0, + "completions/mean_length": 876.3046875, + "completions/mean_terminated_length": 780.710693359375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 1.3448275862068966, + "frac_reward_zero_std": 0.0, + "grad_norm": 65.50568634933693, + "kl": 5.886474609375, + "learning_rate": 4.999575187161439e-06, + "loss": 0.1377, + "num_tokens": 4513995.0, + "reward": 0.07265625149011612, + "reward_std": 0.09746605902910233, + "rewards/code_format_reward/mean": 0.359375, + "rewards/code_format_reward/std": 0.481702595949173, + "rewards/format_reward/mean": 0.3671875, + "rewards/format_reward/std": 0.4839322865009308, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2720.0, + "completions/max_terminated_length": 2720.0, + "completions/mean_length": 848.6328125, + "completions/mean_terminated_length": 777.4343872070312, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 1.4137931034482758, + "frac_reward_zero_std": 0.0, + "grad_norm": 151.71342486055147, + "kl": 5.810791015625, + "learning_rate": 4.9992447956603455e-06, + "loss": 0.1517, + "num_tokens": 4753692.0, + "reward": 0.10703124850988388, + "reward_std": 0.10140325874090195, + "rewards/code_format_reward/mean": 0.515625, + "rewards/code_format_reward/std": 0.5017194747924805, + "rewards/format_reward/mean": 0.5546875, + "rewards/format_reward/std": 0.4989531338214874, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2993.0, + "completions/max_terminated_length": 2561.0, + "completions/mean_length": 744.5625, + "completions/mean_terminated_length": 690.6400146484375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 1.4827586206896552, + "frac_reward_zero_std": 0.0, + "grad_norm": 1.4736174877248536, + "kl": 0.04266357421875, + "learning_rate": 4.998820030352409e-06, + "loss": 0.0836, + "num_tokens": 4978900.0, + "reward": 0.12890625, + "reward_std": 0.0890723466873169, + "rewards/code_format_reward/mean": 0.625, + "rewards/code_format_reward/std": 0.4860251843929291, + "rewards/format_reward/mean": 0.6640625, + "rewards/format_reward/std": 0.47417303919792175, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 1963.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 710.6171875, + "completions/mean_terminated_length": 666.0000610351562, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 1.5517241379310345, + "frac_reward_zero_std": 0.0625, + "grad_norm": 38.20579455574688, + "kl": 0.62890625, + "learning_rate": 4.998300909059929e-06, + "loss": 0.1571, + "num_tokens": 5200931.0, + "reward": 0.13750000298023224, + "reward_std": 0.08701484650373459, + "rewards/code_format_reward/mean": 0.671875, + "rewards/code_format_reward/std": 0.4713755249977112, + "rewards/format_reward/mean": 0.703125, + "rewards/format_reward/std": 0.45867621898651123, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 2847.0, + "completions/max_terminated_length": 2847.0, + "completions/mean_length": 722.390625, + "completions/mean_terminated_length": 711.5556030273438, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 1.6206896551724137, + "frac_reward_zero_std": 0.0625, + "grad_norm": 0.5678012619151958, + "kl": 0.09613037109375, + "learning_rate": 4.997687453564198e-06, + "loss": 0.0384, + "num_tokens": 5423325.0, + "reward": 0.15312500298023224, + "reward_std": 0.07640768587589264, + "rewards/code_format_reward/mean": 0.7265625, + "rewards/code_format_reward/std": 0.447474867105484, + "rewards/format_reward/mean": 0.8046875, + "rewards/format_reward/std": 0.3979988098144531, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2973.0, + "completions/max_terminated_length": 1558.0, + "completions/mean_length": 717.1640625, + "completions/mean_terminated_length": 662.1156616210938, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 1.6896551724137931, + "frac_reward_zero_std": 0.1875, + "grad_norm": 195.2594908645977, + "kl": 7.546875, + "learning_rate": 4.9969796896045775e-06, + "loss": 0.234, + "num_tokens": 5645026.0, + "reward": 0.15937501192092896, + "reward_std": 0.06703707575798035, + "rewards/code_format_reward/mean": 0.765625, + "rewards/code_format_reward/std": 0.42527204751968384, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3787541687488556, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3010.0, + "completions/max_terminated_length": 3010.0, + "completions/mean_length": 677.21875, + "completions/mean_terminated_length": 664.8897705078125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 1.7586206896551724, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.4865500559586326, + "kl": 0.05767822265625, + "learning_rate": 4.996177646877426e-06, + "loss": 0.0349, + "num_tokens": 5862782.0, + "reward": 0.17734375596046448, + "reward_std": 0.04920876771211624, + "rewards/code_format_reward/mean": 0.875, + "rewards/code_format_reward/std": 0.3320184051990509, + "rewards/format_reward/mean": 0.8984375, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2240.0, + "completions/max_terminated_length": 2240.0, + "completions/mean_length": 691.984375, + "completions/mean_terminated_length": 663.6400146484375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 1.8275862068965516, + "frac_reward_zero_std": 0.25, + "grad_norm": 19.735744331019188, + "kl": 1.9664306640625, + "learning_rate": 4.995281359034851e-06, + "loss": 0.1256, + "num_tokens": 6081988.0, + "reward": 0.17656250298023224, + "reward_std": 0.04989224672317505, + "rewards/code_format_reward/mean": 0.859375, + "rewards/code_format_reward/std": 0.3490002751350403, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 2768.0, + "completions/max_terminated_length": 2768.0, + "completions/mean_length": 612.3046875, + "completions/mean_terminated_length": 612.3046875, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 1.896551724137931, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.4611562798663595, + "kl": 0.07080078125, + "learning_rate": 4.994290863683296e-06, + "loss": 0.0295, + "num_tokens": 6290339.0, + "reward": 0.18828123807907104, + "reward_std": 0.029640421271324158, + "rewards/code_format_reward/mean": 0.921875, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 2192.0, + "completions/max_terminated_length": 1643.0, + "completions/mean_length": 700.1875, + "completions/mean_terminated_length": 656.491943359375, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 1.9655172413793105, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1833.9655339013984, + "kl": 105.04736328125, + "learning_rate": 4.99320620238196e-06, + "loss": 1.1265, + "num_tokens": 6511035.0, + "reward": 0.17890626192092896, + "reward_std": 0.04762028157711029, + "rewards/code_format_reward/mean": 0.875, + "rewards/code_format_reward/std": 0.3320184051990509, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3102.0, + "completions/max_terminated_length": 3102.0, + "completions/mean_length": 683.8359375, + "completions/mean_terminated_length": 625.14404296875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 2.0689655172413794, + "frac_reward_zero_std": 0.5625, + "grad_norm": 6.485350364733725, + "kl": 0.551513671875, + "learning_rate": 4.99202742064106e-06, + "loss": 0.1122, + "num_tokens": 6729198.0, + "reward": 0.17421875894069672, + "reward_std": 0.03528411686420441, + "rewards/code_format_reward/mean": 0.859375, + "rewards/code_format_reward/std": 0.3490002751350403, + "rewards/format_reward/mean": 0.8828125, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 1808.0, + "completions/max_terminated_length": 1808.0, + "completions/mean_length": 561.75, + "completions/mean_terminated_length": 551.93701171875, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 2.1379310344827585, + "frac_reward_zero_std": 0.6875, + "grad_norm": 42.970609061888695, + "kl": 3.5179443359375, + "learning_rate": 4.990754567919917e-06, + "loss": 0.0717, + "num_tokens": 6932174.0, + "reward": 0.18828123807907104, + "reward_std": 0.02506173402070999, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.9453125, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 1528.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 560.6953125, + "completions/mean_terminated_length": 544.1370849609375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 2.206896551724138, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.8351996778137165, + "kl": 0.4176025390625, + "learning_rate": 4.989387697624881e-06, + "loss": 0.0621, + "num_tokens": 7135015.0, + "reward": 0.1875, + "reward_std": 0.02130674012005329, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.24301259219646454, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 3781.0, + "completions/max_terminated_length": 1634.0, + "completions/mean_length": 605.7578125, + "completions/mean_terminated_length": 553.1869506835938, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 2.2758620689655173, + "frac_reward_zero_std": 0.375, + "grad_norm": 9.39713171992309, + "kl": 0.841064453125, + "learning_rate": 4.987926867107095e-06, + "loss": 0.0963, + "num_tokens": 7342528.0, + "reward": 0.18203124403953552, + "reward_std": 0.04166591912508011, + "rewards/code_format_reward/mean": 0.890625, + "rewards/code_format_reward/std": 0.31333550810813904, + "rewards/format_reward/mean": 0.9296875, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 1575.0, + "completions/max_terminated_length": 1575.0, + "completions/mean_length": 561.1953125, + "completions/mean_terminated_length": 548.1825561523438, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 2.344827586206897, + "frac_reward_zero_std": 0.625, + "grad_norm": 18.13031332736665, + "kl": 0.6373291015625, + "learning_rate": 4.986372137660078e-06, + "loss": 0.0154, + "num_tokens": 7544289.0, + "reward": 0.18906250596046448, + "reward_std": 0.024831000715494156, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 1737.0, + "completions/max_terminated_length": 1737.0, + "completions/mean_length": 586.734375, + "completions/mean_terminated_length": 562.1370849609375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 2.413793103448276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 615.3181219972676, + "kl": 37.55224609375, + "learning_rate": 4.984723574517165e-06, + "loss": 0.4356, + "num_tokens": 7749295.0, + "reward": 0.18125000596046448, + "reward_std": 0.043191660195589066, + "rewards/code_format_reward/mean": 0.90625, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 1358.0, + "completions/max_terminated_length": 1330.0, + "completions/mean_length": 563.5, + "completions/mean_terminated_length": 550.9761962890625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 2.4827586206896552, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.5637978091750905, + "kl": 0.08837890625, + "learning_rate": 4.9829812468487655e-06, + "loss": 0.0903, + "num_tokens": 7952495.0, + "reward": 0.18125000596046448, + "reward_std": 0.047710247337818146, + "rewards/code_format_reward/mean": 0.8984375, + "rewards/code_format_reward/std": 0.3032590448856354, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 1187.0, + "completions/max_terminated_length": 1187.0, + "completions/mean_length": 517.3671875, + "completions/mean_terminated_length": 517.3671875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 2.5517241379310347, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5349360457928953, + "kl": 0.1214599609375, + "learning_rate": 4.981145227759457e-06, + "loss": -0.0018, + "num_tokens": 8149790.0, + "reward": 0.18828125298023224, + "reward_std": 0.03314562886953354, + "rewards/code_format_reward/mean": 0.9296875, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2934.0, + "completions/max_terminated_length": 2934.0, + "completions/mean_length": 626.6328125, + "completions/mean_terminated_length": 573.9173583984375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 2.6206896551724137, + "frac_reward_zero_std": 0.25, + "grad_norm": 125.2304296272452, + "kl": 20.3221435546875, + "learning_rate": 4.979215594284924e-06, + "loss": 0.3347, + "num_tokens": 8361071.0, + "reward": 0.16093750298023224, + "reward_std": 0.0606289878487587, + "rewards/code_format_reward/mean": 0.78125, + "rewards/code_format_reward/std": 0.41502299904823303, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3787541687488556, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 2936.0, + "completions/max_terminated_length": 2936.0, + "completions/mean_length": 516.2265625, + "completions/mean_terminated_length": 511.8582763671875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 2.689655172413793, + "frac_reward_zero_std": 0.4375, + "grad_norm": 9.576331967136065, + "kl": 0.2928466796875, + "learning_rate": 4.977192427388722e-06, + "loss": 0.096, + "num_tokens": 8558220.0, + "reward": 0.17890626192092896, + "reward_std": 0.0444980263710022, + "rewards/code_format_reward/mean": 0.890625, + "rewards/code_format_reward/std": 0.31333550810813904, + "rewards/format_reward/mean": 0.8984375, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 2858.0, + "completions/max_terminated_length": 2858.0, + "completions/mean_length": 571.1640625, + "completions/mean_terminated_length": 516.280029296875, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 2.7586206896551726, + "frac_reward_zero_std": 0.375, + "grad_norm": 85.00285353330676, + "kl": 14.6357421875, + "learning_rate": 4.9750758119588824e-06, + "loss": 0.3169, + "num_tokens": 8762169.0, + "reward": 0.16484375298023224, + "reward_std": 0.057621706277132034, + "rewards/code_format_reward/mean": 0.8203125, + "rewards/code_format_reward/std": 0.3854354918003082, + "rewards/format_reward/mean": 0.828125, + "rewards/format_reward/std": 0.3787541687488556, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 2583.0, + "completions/max_terminated_length": 2583.0, + "completions/mean_length": 599.609375, + "completions/mean_terminated_length": 592.5079956054688, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 2.8275862068965516, + "frac_reward_zero_std": 0.125, + "grad_norm": 5.760759174842388, + "kl": 1.2294921875, + "learning_rate": 4.972865836804349e-06, + "loss": 0.1275, + "num_tokens": 8969991.0, + "reward": 0.16953125596046448, + "reward_std": 0.0626438558101654, + "rewards/code_format_reward/mean": 0.8359375, + "rewards/code_format_reward/std": 0.371787428855896, + "rewards/format_reward/mean": 0.859375, + "rewards/format_reward/std": 0.3490002751350403, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 1428.0, + "completions/max_terminated_length": 1428.0, + "completions/mean_length": 492.015625, + "completions/mean_terminated_length": 492.015625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 2.896551724137931, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5019665707220321, + "kl": 0.1136474609375, + "learning_rate": 4.970562594651254e-06, + "loss": 0.0647, + "num_tokens": 9164041.0, + "reward": 0.18203124403953552, + "reward_std": 0.03968694061040878, + "rewards/code_format_reward/mean": 0.90625, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 1865.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 566.1640625, + "completions/mean_terminated_length": 549.26611328125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 2.9655172413793105, + "frac_reward_zero_std": 0.0, + "grad_norm": 2.5630106181026244, + "kl": 0.353515625, + "learning_rate": 4.968166182139026e-06, + "loss": 0.0204, + "num_tokens": 9366678.0, + "reward": 0.13671875, + "reward_std": 0.09157585352659225, + "rewards/code_format_reward/mean": 0.6796875, + "rewards/code_format_reward/std": 0.4684300124645233, + "rewards/format_reward/mean": 0.6875, + "rewards/format_reward/std": 0.4653336703777313, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 2240.0, + "completions/max_terminated_length": 2240.0, + "completions/mean_length": 563.0, + "completions/mean_terminated_length": 550.0159301757812, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 3.0689655172413794, + "frac_reward_zero_std": 0.125, + "grad_norm": 0.5947582776397699, + "kl": 0.0906982421875, + "learning_rate": 4.9656766998163306e-06, + "loss": 0.071, + "num_tokens": 9568206.0, + "reward": 0.15625, + "reward_std": 0.07245282828807831, + "rewards/code_format_reward/mean": 0.765625, + "rewards/code_format_reward/std": 0.42527204751968384, + "rewards/format_reward/mean": 0.796875, + "rewards/format_reward/std": 0.40390563011169434, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 1630.0, + "completions/max_terminated_length": 1630.0, + "completions/mean_length": 617.5703125, + "completions/mean_terminated_length": 560.99169921875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 3.1379310344827585, + "frac_reward_zero_std": 0.1875, + "grad_norm": 29.528488713072353, + "kl": 0.284423828125, + "learning_rate": 4.963094252136865e-06, + "loss": 0.1132, + "num_tokens": 9778327.0, + "reward": 0.14531250298023224, + "reward_std": 0.07226449996232986, + "rewards/code_format_reward/mean": 0.71875, + "rewards/code_format_reward/std": 0.4513758420944214, + "rewards/format_reward/mean": 0.734375, + "rewards/format_reward/std": 0.44340085983276367, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 2568.0, + "completions/max_terminated_length": 2568.0, + "completions/mean_length": 523.90625, + "completions/mean_terminated_length": 516.9448852539062, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 3.206896551724138, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7663643054090497, + "kl": 0.1002197265625, + "learning_rate": 4.960418947454958e-06, + "loss": 0.167, + "num_tokens": 9974411.0, + "reward": 0.16171875596046448, + "reward_std": 0.07921412587165833, + "rewards/code_format_reward/mean": 0.796875, + "rewards/code_format_reward/std": 0.40390563011169434, + "rewards/format_reward/mean": 0.8203125, + "rewards/format_reward/std": 0.3854354918003082, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 1431.0, + "completions/max_terminated_length": 1431.0, + "completions/mean_length": 482.03125, + "completions/mean_terminated_length": 482.03125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 3.2758620689655173, + "frac_reward_zero_std": 0.25, + "grad_norm": 0.6031669012487536, + "kl": 0.10595703125, + "learning_rate": 4.957650898021038e-06, + "loss": -0.0048, + "num_tokens": 10167183.0, + "reward": 0.17500001192092896, + "reward_std": 0.05759534612298012, + "rewards/code_format_reward/mean": 0.8671875, + "rewards/code_format_reward/std": 0.3407054841518402, + "rewards/format_reward/mean": 0.8828125, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3389.0, + "completions/max_terminated_length": 1360.0, + "completions/mean_length": 540.1953125, + "completions/mean_terminated_length": 517.7637939453125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 3.344827586206897, + "frac_reward_zero_std": 0.3125, + "grad_norm": 0.6402833424208365, + "kl": 0.107177734375, + "learning_rate": 4.954790219976915e-06, + "loss": 0.1032, + "num_tokens": 10367400.0, + "reward": 0.17734375596046448, + "reward_std": 0.05118774622678757, + "rewards/code_format_reward/mean": 0.8828125, + "rewards/code_format_reward/std": 0.322907418012619, + "rewards/format_reward/mean": 0.890625, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 2281.0, + "completions/max_terminated_length": 2281.0, + "completions/mean_length": 535.6015625, + "completions/mean_terminated_length": 530.2125854492188, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 3.413793103448276, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5622681425626702, + "kl": 0.0972900390625, + "learning_rate": 4.95183703335091e-06, + "loss": 0.0237, + "num_tokens": 10566797.0, + "reward": 0.18281251192092896, + "reward_std": 0.04250866919755936, + "rewards/code_format_reward/mean": 0.9140625, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.9140625, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 1695.0, + "completions/max_terminated_length": 1695.0, + "completions/mean_length": 516.953125, + "completions/mean_terminated_length": 503.94403076171875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 3.4827586206896552, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.1308403779181986, + "kl": 0.2667236328125, + "learning_rate": 4.948791462052819e-06, + "loss": 0.1069, + "num_tokens": 10762871.0, + "reward": 0.18437500298023224, + "reward_std": 0.03237384930253029, + "rewards/code_format_reward/mean": 0.9140625, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.9296875, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 2243.0, + "completions/max_terminated_length": 1080.0, + "completions/mean_length": 526.25, + "completions/mean_terminated_length": 483.9837341308594, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 3.5517241379310347, + "frac_reward_zero_std": 0.5625, + "grad_norm": 15.23470491238641, + "kl": 1.3590087890625, + "learning_rate": 4.945653633868716e-06, + "loss": 0.1142, + "num_tokens": 10961303.0, + "reward": 0.18125000596046448, + "reward_std": 0.035247981548309326, + "rewards/code_format_reward/mean": 0.90625, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 475.140625, + "completions/mean_terminated_length": 475.140625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 3.6206896551724137, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.41814078259259574, + "kl": 0.1163330078125, + "learning_rate": 4.942423680455584e-06, + "loss": 0.0046, + "num_tokens": 11153193.0, + "reward": 0.19218748807907104, + "reward_std": 0.02057085558772087, + "rewards/code_format_reward/mean": 0.953125, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 1911.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 493.4765625, + "completions/mean_terminated_length": 482.3149719238281, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 3.689655172413793, + "frac_reward_zero_std": 0.75, + "grad_norm": 1664.6862364926953, + "kl": 100.5736083984375, + "learning_rate": 4.939101737335802e-06, + "loss": 1.0242, + "num_tokens": 11347430.0, + "reward": 0.18984374403953552, + "reward_std": 0.020350992679595947, + "rewards/code_format_reward/mean": 0.9453125, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 1640.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 523.0625, + "completions/mean_terminated_length": 516.1181030273438, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 3.7586206896551726, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.42621486826703325, + "kl": 0.127197265625, + "learning_rate": 4.935687943891447e-06, + "loss": -0.0541, + "num_tokens": 11545454.0, + "reward": 0.19296875596046448, + "reward_std": 0.016834918409585953, + "rewards/code_format_reward/mean": 0.953125, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3578.0, + "completions/max_terminated_length": 1532.0, + "completions/mean_length": 564.2109375, + "completions/mean_terminated_length": 499.84423828125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 3.8275862068965516, + "frac_reward_zero_std": 0.75, + "grad_norm": 78.37140371828227, + "kl": 12.0, + "learning_rate": 4.932182443358458e-06, + "loss": 0.2436, + "num_tokens": 11747649.0, + "reward": 0.18828125298023224, + "reward_std": 0.01909703202545643, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.9453125, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 1486.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 518.0234375, + "completions/mean_terminated_length": 503.9031982421875, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 3.896551724137931, + "frac_reward_zero_std": 0.625, + "grad_norm": 8.921711491174781, + "kl": 0.603759765625, + "learning_rate": 4.928585382820616e-06, + "loss": 0.0247, + "num_tokens": 11945028.0, + "reward": 0.18437500298023224, + "reward_std": 0.031300365924835205, + "rewards/code_format_reward/mean": 0.921875, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.921875, + "rewards/format_reward/std": 0.2694226801395416, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 525.0078125, + "completions/mean_terminated_length": 491.5040283203125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 3.9655172413793105, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5156563431137091, + "kl": 0.16015625, + "learning_rate": 4.924896913203376e-06, + "loss": 0.0297, + "num_tokens": 12143301.0, + "reward": 0.19453126192092896, + "reward_std": 0.012415500357747078, + "rewards/code_format_reward/mean": 0.96875, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 3213.0, + "completions/max_terminated_length": 1010.0, + "completions/mean_length": 519.9921875, + "completions/mean_terminated_length": 462.96002197265625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 4.068965517241379, + "frac_reward_zero_std": 0.6875, + "grad_norm": 8.189691709912301, + "kl": 4.8702392578125, + "learning_rate": 4.921117189267535e-06, + "loss": 0.2236, + "num_tokens": 12340932.0, + "reward": 0.19140625, + "reward_std": 0.022327817976474762, + "rewards/code_format_reward/mean": 0.953125, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.9609375, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3751.0, + "completions/max_terminated_length": 1646.0, + "completions/mean_length": 581.3671875, + "completions/mean_terminated_length": 556.409423828125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 4.137931034482759, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5406944658569534, + "kl": 0.107666015625, + "learning_rate": 4.917246369602742e-06, + "loss": 0.0798, + "num_tokens": 12546419.0, + "reward": 0.18906250596046448, + "reward_std": 0.02414703369140625, + "rewards/code_format_reward/mean": 0.9453125, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.9453125, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 1408.0, + "completions/max_terminated_length": 1216.0, + "completions/mean_length": 454.96875, + "completions/mean_terminated_length": 447.4645690917969, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 4.206896551724138, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.33497177183170196, + "kl": 0.1175537109375, + "learning_rate": 4.9132846166208355e-06, + "loss": -0.0154, + "num_tokens": 12735495.0, + "reward": 0.19374999403953552, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.9609375, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 1474.0, + "completions/max_terminated_length": 1474.0, + "completions/mean_length": 507.015625, + "completions/mean_terminated_length": 500.3149719238281, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 4.275862068965517, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4596870009961427, + "kl": 0.1395263671875, + "learning_rate": 4.9092320965490365e-06, + "loss": 0.032, + "num_tokens": 12931465.0, + "reward": 0.18984374403953552, + "reward_std": 0.022621294483542442, + "rewards/code_format_reward/mean": 0.9453125, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 486.4296875, + "completions/mean_terminated_length": 459.25982666015625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 4.344827586206897, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.526302411386141, + "kl": 0.135498046875, + "learning_rate": 4.905088979422971e-06, + "loss": 0.071, + "num_tokens": 13123896.0, + "reward": 0.19296875596046448, + "reward_std": 0.017908399924635887, + "rewards/code_format_reward/mean": 0.9609375, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.96875, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 1327.0, + "completions/mean_length": 511.5625, + "completions/mean_terminated_length": 442.70965576171875, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 4.413793103448276, + "frac_reward_zero_std": 0.625, + "grad_norm": 74.57382104056599, + "kl": 11.362548828125, + "learning_rate": 4.900855439079536e-06, + "loss": 0.222, + "num_tokens": 13319280.0, + "reward": 0.18906250596046448, + "reward_std": 0.02590448409318924, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.953125, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3933.0, + "completions/max_terminated_length": 1508.0, + "completions/mean_length": 539.40625, + "completions/mean_terminated_length": 433.0322570800781, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 4.482758620689655, + "frac_reward_zero_std": 0.4375, + "grad_norm": 19.50917337775108, + "kl": 2.5927734375, + "learning_rate": 4.8965316531496055e-06, + "loss": 0.2921, + "num_tokens": 13519396.0, + "reward": 0.18515625596046448, + "reward_std": 0.03893200308084488, + "rewards/code_format_reward/mean": 0.921875, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.9296875, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 2423.0, + "completions/max_terminated_length": 2423.0, + "completions/mean_length": 428.5859375, + "completions/mean_terminated_length": 428.5859375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 4.551724137931035, + "frac_reward_zero_std": 0.625, + "grad_norm": 5.412446385383582, + "kl": 0.87109375, + "learning_rate": 4.892117803050578e-06, + "loss": -0.0089, + "num_tokens": 13705327.0, + "reward": 0.18906250596046448, + "reward_std": 0.020641827955842018, + "rewards/code_format_reward/mean": 0.90625, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 1341.0, + "completions/max_terminated_length": 970.0, + "completions/mean_length": 452.921875, + "completions/mean_terminated_length": 440.9762268066406, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 4.620689655172414, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5397669168631566, + "kl": 0.194580078125, + "learning_rate": 4.887614073978761e-06, + "loss": -0.011, + "num_tokens": 13894373.0, + "reward": 0.19140625, + "reward_std": 0.020348839461803436, + "rewards/code_format_reward/mean": 0.9375, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3615.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 534.53125, + "completions/mean_terminated_length": 447.82257080078125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 4.689655172413794, + "frac_reward_zero_std": 0.625, + "grad_norm": 91.20082626936673, + "kl": 36.480224609375, + "learning_rate": 4.883020654901609e-06, + "loss": 0.4063, + "num_tokens": 14093865.0, + "reward": 0.19296875596046448, + "reward_std": 0.019887376576662064, + "rewards/code_format_reward/mean": 0.953125, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3565.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 522.640625, + "completions/mean_terminated_length": 433.25, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 4.758620689655173, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1158.116969084908, + "kl": 202.0533447265625, + "learning_rate": 4.878337738549785e-06, + "loss": 2.2702, + "num_tokens": 14291395.0, + "reward": 0.1875, + "reward_std": 0.030323900282382965, + "rewards/code_format_reward/mean": 0.9296875, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.9453125, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -3.0, + "completions/max_length": 1948.0, + "completions/max_terminated_length": 1948.0, + "completions/mean_length": 479.265625, + "completions/mean_terminated_length": 479.265625, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 4.827586206896552, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8904557320426325, + "kl": 0.16650390625, + "learning_rate": 4.873565521409082e-06, + "loss": 0.0518, + "num_tokens": 14481549.0, + "reward": 0.1953125, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.9765625, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3196.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 620.3203125, + "completions/mean_terminated_length": 522.8416748046875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 4.896551724137931, + "frac_reward_zero_std": 0.375, + "grad_norm": 4.105422981049957, + "kl": 3.436767578125, + "learning_rate": 4.868704203712173e-06, + "loss": 0.3129, + "num_tokens": 14690878.0, + "reward": 0.17500001192092896, + "reward_std": 0.049049004912376404, + "rewards/code_format_reward/mean": 0.8671875, + "rewards/code_format_reward/std": 0.3407054841518402, + "rewards/format_reward/mean": 0.8828125, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.28125, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 3032.0, + "completions/mean_length": 1178.859375, + "completions/mean_terminated_length": 621.0952758789062, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 4.9655172413793105, + "frac_reward_zero_std": 0.0625, + "grad_norm": 213.86405401091332, + "kl": 98.63232421875, + "learning_rate": 4.86375398943021e-06, + "loss": 1.5566, + "num_tokens": 14972844.0, + "reward": 0.14218750596046448, + "reward_std": 0.08679269254207611, + "rewards/code_format_reward/mean": 0.7109375, + "rewards/code_format_reward/std": 0.45510825514793396, + "rewards/format_reward/mean": 0.7109375, + "rewards/format_reward/std": 0.45510825514793396, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.5, + "completions/max_length": 3781.0, + "completions/max_terminated_length": 2397.0, + "completions/mean_length": 1726.734375, + "completions/mean_terminated_length": 699.8500366210938, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 5.068965517241379, + "frac_reward_zero_std": 0.0, + "grad_norm": 172.5869365555856, + "kl": 94.25, + "learning_rate": 4.858715086264274e-06, + "loss": 1.477, + "num_tokens": 15324938.0, + "reward": 0.09375, + "reward_std": 0.10150270909070969, + "rewards/code_format_reward/mean": 0.4609375, + "rewards/code_format_reward/std": 0.5004304051399231, + "rewards/format_reward/mean": 0.4765625, + "rewards/format_reward/std": 0.5014128684997559, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3436.0, + "completions/mean_length": 2458.3125, + "completions/mean_terminated_length": 1035.1429443359375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 5.137931034482759, + "frac_reward_zero_std": 0.0625, + "grad_norm": 57.25662177745869, + "kl": 40.15625, + "learning_rate": 4.853587705636646e-06, + "loss": 0.8415, + "num_tokens": 15770234.0, + "reward": 0.05624999850988388, + "reward_std": 0.0847882479429245, + "rewards/code_format_reward/mean": 0.2734375, + "rewards/code_format_reward/std": 0.447474867105484, + "rewards/format_reward/mean": 0.2890625, + "rewards/format_reward/std": 0.45510825514793396, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.34375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3138.0, + "completions/mean_length": 2853.59375, + "completions/mean_terminated_length": 974.1162719726562, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "epoch": 5.206896551724138, + "frac_reward_zero_std": 0.25, + "grad_norm": 13.862558890178295, + "kl": 12.4921875, + "learning_rate": 4.84837206268195e-06, + "loss": 0.3769, + "num_tokens": 16265422.0, + "reward": 0.03203125298023224, + "reward_std": 0.06078151986002922, + "rewards/code_format_reward/mean": 0.15625, + "rewards/code_format_reward/std": 0.3645188808441162, + "rewards/format_reward/mean": 0.1640625, + "rewards/format_reward/std": 0.371787428855896, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 1657.0, + "completions/mean_length": 3360.890625, + "completions/mean_terminated_length": 807.9199829101562, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 5.275862068965517, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1593085629950635, + "kl": 1.6650390625, + "learning_rate": 4.8430683762381195e-06, + "loss": 0.1107, + "num_tokens": 16826688.0, + "reward": 0.012500000186264515, + "reward_std": 0.02925041876733303, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.24301259219646454, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2445.0, + "completions/mean_length": 3014.8125, + "completions/mean_terminated_length": 767.2142944335938, + "completions/min_length": 276.0, + "completions/min_terminated_length": 276.0, + "epoch": 5.344827586206897, + "frac_reward_zero_std": 0.375, + "grad_norm": 3.9131053387344386, + "kl": 0.572265625, + "learning_rate": 4.837676868837213e-06, + "loss": 0.1534, + "num_tokens": 17342752.0, + "reward": 0.02734375, + "reward_std": 0.04960141330957413, + "rewards/code_format_reward/mean": 0.1328125, + "rewards/code_format_reward/std": 0.3407054841518402, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3490002751350403, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2068.0, + "completions/mean_length": 3525.0703125, + "completions/mean_terminated_length": 1183.3157958984375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 5.413793103448276, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.7497217743483526, + "kl": 0.58984375, + "learning_rate": 4.832197766696085e-06, + "loss": 0.0596, + "num_tokens": 17923705.0, + "reward": 0.008593750186264515, + "reward_std": 0.01593157649040222, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.4375, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 3009.0, + "completions/mean_length": 3445.8515625, + "completions/mean_terminated_length": 1025.0555419921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 5.482758620689655, + "frac_reward_zero_std": 0.5625, + "grad_norm": 10.680361044962455, + "kl": 0.8134765625, + "learning_rate": 4.826631299706887e-06, + "loss": 0.1354, + "num_tokens": 18495846.0, + "reward": 0.01406249962747097, + "reward_std": 0.0329858660697937, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.34375, + "completions/max_length": 3911.0, + "completions/max_terminated_length": 2769.0, + "completions/mean_length": 3161.1484375, + "completions/mean_terminated_length": 1222.09521484375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 5.551724137931035, + "frac_reward_zero_std": 0.6875, + "grad_norm": 5.863415608971705, + "kl": 1.5400390625, + "learning_rate": 4.820977701427424e-06, + "loss": 0.0927, + "num_tokens": 19031545.0, + "reward": 0.008593750186264515, + "reward_std": 0.022327817976474762, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 3779.0, + "completions/max_terminated_length": 3281.0, + "completions/mean_length": 3447.84375, + "completions/mean_terminated_length": 1468.666748046875, + "completions/min_length": 369.0, + "completions/min_terminated_length": 369.0, + "epoch": 5.620689655172414, + "frac_reward_zero_std": 0.6875, + "grad_norm": 44.120715406394496, + "kl": 13.8203125, + "learning_rate": 4.81523720907136e-06, + "loss": 0.1843, + "num_tokens": 19603941.0, + "reward": 0.0078125, + "reward_std": 0.019044626504182816, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5625, + "completions/max_length": 3712.0, + "completions/max_terminated_length": 1597.0, + "completions/mean_length": 3321.03125, + "completions/mean_terminated_length": 975.4285888671875, + "completions/min_length": 329.0, + "completions/min_terminated_length": 329.0, + "epoch": 5.689655172413794, + "frac_reward_zero_std": 0.8125, + "grad_norm": 60.204694884732945, + "kl": 21.171875, + "learning_rate": 4.809410063498254e-06, + "loss": 0.2614, + "num_tokens": 20160105.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.53125, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 3121.0, + "completions/mean_length": 3610.2421875, + "completions/mean_terminated_length": 1546.3333740234375, + "completions/min_length": 383.0, + "completions/min_terminated_length": 383.0, + "epoch": 5.758620689655173, + "frac_reward_zero_std": 0.9375, + "grad_norm": 20.14460667732567, + "kl": 7.09375, + "learning_rate": 4.8034965092034656e-06, + "loss": 0.0843, + "num_tokens": 20753288.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2531.0, + "completions/mean_length": 3683.359375, + "completions/mean_terminated_length": 1158.8333740234375, + "completions/min_length": 527.0, + "completions/min_terminated_length": 527.0, + "epoch": 5.827586206896552, + "frac_reward_zero_std": 0.75, + "grad_norm": 14.12037911937169, + "kl": 5.85546875, + "learning_rate": 4.797496794307889e-06, + "loss": 0.1153, + "num_tokens": 21354662.0, + "reward": 0.0078125, + "reward_std": 0.019044626504182816, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.6875, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 2420.0, + "completions/mean_length": 3650.7734375, + "completions/mean_terminated_length": 1109.300048828125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 5.896551724137931, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9898362940499612, + "kl": 0.728515625, + "learning_rate": 4.791411170547545e-06, + "loss": 0.0412, + "num_tokens": 21953033.0, + "reward": 0.007031249813735485, + "reward_std": 0.013782460242509842, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2635.0, + "completions/mean_length": 3448.1015625, + "completions/mean_terminated_length": 1277.0, + "completions/min_length": 317.0, + "completions/min_terminated_length": 317.0, + "epoch": 5.9655172413793105, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.3365281655016754, + "kl": 0.44677734375, + "learning_rate": 4.785239893263017e-06, + "loss": 0.0226, + "num_tokens": 22524294.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2837.0, + "completions/mean_length": 3402.4609375, + "completions/mean_terminated_length": 1330.84619140625, + "completions/min_length": 367.0, + "completions/min_terminated_length": 367.0, + "epoch": 6.068965517241379, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.9994644728808741, + "kl": 0.41796875, + "learning_rate": 4.778983221388742e-06, + "loss": 0.0222, + "num_tokens": 23089737.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.5, + "completions/max_length": 4078.0, + "completions/max_terminated_length": 3937.0, + "completions/mean_length": 3424.5546875, + "completions/mean_terminated_length": 1120.1875, + "completions/min_length": 259.0, + "completions/min_terminated_length": 259.0, + "epoch": 6.137931034482759, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.7994480278597837, + "kl": 0.7373046875, + "learning_rate": 4.77264141744214e-06, + "loss": 0.0257, + "num_tokens": 23659152.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 3255.0, + "completions/mean_length": 3615.6796875, + "completions/mean_terminated_length": 1483.8333740234375, + "completions/min_length": 640.0, + "completions/min_terminated_length": 640.0, + "epoch": 6.206896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2463644130168377, + "kl": 0.3974609375, + "learning_rate": 4.766214747512603e-06, + "loss": 0.004, + "num_tokens": 24253031.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3220.0, + "completions/mean_length": 3160.4765625, + "completions/mean_terminated_length": 1251.0526123046875, + "completions/min_length": 337.0, + "completions/min_terminated_length": 337.0, + "epoch": 6.275862068965517, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7416070034458621, + "kl": 0.56201171875, + "learning_rate": 4.759703481250331e-06, + "loss": 0.0459, + "num_tokens": 24788412.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.40625, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 3619.0, + "completions/mean_length": 3303.9453125, + "completions/mean_terminated_length": 1231.5789794921875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "epoch": 6.344827586206897, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4067648831787075, + "kl": 1.296875, + "learning_rate": 4.753107891855015e-06, + "loss": 0.0523, + "num_tokens": 25342389.0, + "reward": 0.004687500186264515, + "reward_std": 0.010205792263150215, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.15625, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 3598.0, + "completions/mean_length": 3240.4765625, + "completions/mean_terminated_length": 1464.7838134765625, + "completions/min_length": 221.0, + "completions/min_terminated_length": 221.0, + "epoch": 6.413793103448276, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.5773830436721306, + "kl": 0.8857421875, + "learning_rate": 4.746428256064375e-06, + "loss": 0.0271, + "num_tokens": 25888242.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3607.0, + "completions/mean_length": 3247.8046875, + "completions/mean_terminated_length": 1320.7059326171875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 6.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47019129510941726, + "kl": 0.45703125, + "learning_rate": 4.7396648541425534e-06, + "loss": 0.0046, + "num_tokens": 26433865.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2806.0, + "completions/mean_length": 3004.3828125, + "completions/mean_terminated_length": 1139.6591796875, + "completions/min_length": 281.0, + "completions/min_terminated_length": 281.0, + "epoch": 6.551724137931035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12969886319345608, + "kl": 0.681640625, + "learning_rate": 4.732817969868348e-06, + "loss": 0.0068, + "num_tokens": 26948594.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.46875, + "completions/max_length": 3961.0, + "completions/max_terminated_length": 2790.0, + "completions/mean_length": 2860.5859375, + "completions/mean_terminated_length": 1184.319091796875, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 6.620689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.572939086350903, + "kl": 0.71875, + "learning_rate": 4.7258878905233095e-06, + "loss": 0.0072, + "num_tokens": 27445821.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 4015.0, + "completions/mean_length": 2526.8984375, + "completions/mean_terminated_length": 1347.7257080078125, + "completions/min_length": 51.0, + "completions/min_terminated_length": 51.0, + "epoch": 6.689655172413794, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.6818093659965091, + "kl": 0.4169921875, + "learning_rate": 4.718874906879688e-06, + "loss": 0.023, + "num_tokens": 27899240.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3497.0, + "completions/mean_length": 1971.0546875, + "completions/mean_terminated_length": 1234.521728515625, + "completions/min_length": 37.0, + "completions/min_terminated_length": 37.0, + "epoch": 6.758620689655173, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5461191185776753, + "kl": 0.300048828125, + "learning_rate": 4.711779313188231e-06, + "loss": 0.0228, + "num_tokens": 28282167.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 3684.0, + "completions/max_terminated_length": 3134.0, + "completions/mean_length": 2307.0703125, + "completions/mean_terminated_length": 1120.921875, + "completions/min_length": 69.0, + "completions/min_terminated_length": 69.0, + "epoch": 6.827586206896552, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8120148624300454, + "kl": 0.53271484375, + "learning_rate": 4.70460140716584e-06, + "loss": 0.0365, + "num_tokens": 28708544.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.90625, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 3661.0, + "completions/mean_length": 2056.6484375, + "completions/mean_terminated_length": 1348.0107421875, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 6.896551724137931, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1307146547923295, + "kl": 0.3037109375, + "learning_rate": 4.697341489983076e-06, + "loss": 0.0048, + "num_tokens": 29102867.0, + "reward": 0.0054687499068677425, + "reward_std": 0.015467959456145763, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3552.0, + "completions/mean_length": 2191.359375, + "completions/mean_terminated_length": 1324.2047119140625, + "completions/min_length": 34.0, + "completions/min_terminated_length": 34.0, + "epoch": 6.9655172413793105, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.28935824632623103, + "kl": 0.521484375, + "learning_rate": 4.6899998662515215e-06, + "loss": 0.0193, + "num_tokens": 29513265.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.875, + "completions/max_length": 3855.0, + "completions/max_terminated_length": 2728.0, + "completions/mean_length": 1871.7890625, + "completions/mean_terminated_length": 1163.478271484375, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 7.068965517241379, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.2101373691948605, + "kl": 1.4599609375, + "learning_rate": 4.682576844011007e-06, + "loss": 0.0146, + "num_tokens": 29883926.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3237.0, + "completions/mean_length": 1758.7578125, + "completions/mean_terminated_length": 1119.5208740234375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 7.137931034482759, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.913814548832671, + "kl": 0.5185546875, + "learning_rate": 4.675072734716678e-06, + "loss": 0.0172, + "num_tokens": 30238047.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.21875, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 3417.0, + "completions/mean_length": 1792.265625, + "completions/mean_terminated_length": 1326.1748046875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 7.206896551724138, + "frac_reward_zero_std": 0.9375, + "grad_norm": 5.294271080104991, + "kl": 2.69140625, + "learning_rate": 4.667487853225931e-06, + "loss": 0.0362, + "num_tokens": 30598529.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.34375, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 3415.0, + "completions/mean_length": 1649.203125, + "completions/mean_terminated_length": 1282.2523193359375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 7.275862068965517, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.3488144201379413, + "kl": 0.72216796875, + "learning_rate": 4.659822517785203e-06, + "loss": 0.0245, + "num_tokens": 30940699.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3750.0, + "completions/mean_length": 1611.8125, + "completions/mean_terminated_length": 1136.2735595703125, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 7.344827586206897, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.36195644471317184, + "kl": 0.6396484375, + "learning_rate": 4.6520770500166165e-06, + "loss": -0.0007, + "num_tokens": 31276915.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2910.0, + "completions/mean_length": 1516.9765625, + "completions/mean_terminated_length": 1113.4537353515625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 7.413793103448276, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.006734292231092, + "kl": 0.8564453125, + "learning_rate": 4.644251774904487e-06, + "loss": 0.014, + "num_tokens": 31601720.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.375, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 4004.0, + "completions/mean_length": 1549.9921875, + "completions/mean_terminated_length": 1126.2037353515625, + "completions/min_length": 58.0, + "completions/min_terminated_length": 58.0, + "epoch": 7.482758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.402705549923824, + "kl": 0.48681640625, + "learning_rate": 4.636347020781684e-06, + "loss": -0.0058, + "num_tokens": 31930095.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.34375, + "completions/max_length": 3865.0, + "completions/max_terminated_length": 3740.0, + "completions/mean_length": 1463.078125, + "completions/mean_terminated_length": 1016.1494750976562, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 7.551724137931035, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04885965524357822, + "kl": 0.261962890625, + "learning_rate": 4.6283631193158605e-06, + "loss": 0.0026, + "num_tokens": 32248441.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 3005.0, + "completions/mean_length": 1335.765625, + "completions/mean_terminated_length": 1070.67822265625, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 7.620689655172414, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10929604136729178, + "kl": 0.15478515625, + "learning_rate": 4.620300405495532e-06, + "loss": 0.0117, + "num_tokens": 32550491.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3570.0, + "completions/mean_length": 1237.7109375, + "completions/mean_terminated_length": 1018.67822265625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 7.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.034299438449552, + "kl": 0.1138916015625, + "learning_rate": 4.612159217616022e-06, + "loss": 0.0011, + "num_tokens": 32838846.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.3125, + "completions/max_length": 3574.0, + "completions/max_terminated_length": 3104.0, + "completions/mean_length": 1457.6875, + "completions/mean_terminated_length": 1041.4716796875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 7.758620689655173, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3699601876805936, + "kl": 0.6552734375, + "learning_rate": 4.603939897265268e-06, + "loss": 0.0401, + "num_tokens": 33156502.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 3954.0, + "completions/max_terminated_length": 3755.0, + "completions/mean_length": 1153.484375, + "completions/mean_terminated_length": 949.0689697265625, + "completions/min_length": 29.0, + "completions/min_terminated_length": 29.0, + "epoch": 7.827586206896552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08969434421826968, + "kl": 0.30517578125, + "learning_rate": 4.595642789309492e-06, + "loss": 0.0031, + "num_tokens": 33435220.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 3844.0, + "completions/max_terminated_length": 3298.0, + "completions/mean_length": 1188.6953125, + "completions/mean_terminated_length": 964.1206665039062, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 7.896551724137931, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.25288165833131593, + "kl": 0.2178955078125, + "learning_rate": 4.587268241878724e-06, + "loss": 0.0172, + "num_tokens": 33718213.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 2833.0, + "completions/mean_length": 1103.6796875, + "completions/mean_terminated_length": 940.9827270507812, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 7.9655172413793105, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3612955105647937, + "kl": 0.3759765625, + "learning_rate": 4.578816606352205e-06, + "loss": 0.0199, + "num_tokens": 33990556.0, + "reward": 0.0078125, + "reward_std": 0.022097084671258926, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3685.0, + "completions/max_terminated_length": 3685.0, + "completions/mean_length": 1020.4765625, + "completions/mean_terminated_length": 969.6209106445312, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 8.068965517241379, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.17683694538045752, + "kl": 0.17919921875, + "learning_rate": 4.570288237343632e-06, + "loss": 0.003, + "num_tokens": 34252249.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 3087.0, + "completions/max_terminated_length": 2594.0, + "completions/mean_length": 1116.2421875, + "completions/mean_terminated_length": 948.6724243164062, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 8.137931034482758, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3409942466350483, + "kl": 0.39453125, + "learning_rate": 4.561683492686289e-06, + "loss": -0.0195, + "num_tokens": 34525056.0, + "reward": 0.0062500000931322575, + "reward_std": 0.015698691830039024, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3675.0, + "completions/max_terminated_length": 2709.0, + "completions/mean_length": 1004.4140625, + "completions/mean_terminated_length": 899.4417114257812, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 8.206896551724139, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6946015343722765, + "kl": 0.14532470703125, + "learning_rate": 4.5530027334180285e-06, + "loss": 0.01, + "num_tokens": 34783789.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3457.0, + "completions/max_terminated_length": 2343.0, + "completions/mean_length": 985.4453125, + "completions/mean_terminated_length": 855.6693725585938, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 8.275862068965518, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.0595839352292757, + "kl": 0.18084716796875, + "learning_rate": 4.544246323766122e-06, + "loss": 0.0144, + "num_tokens": 35040998.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.28125, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 3515.0, + "completions/mean_length": 1370.171875, + "completions/mean_terminated_length": 955.2857666015625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 8.344827586206897, + "frac_reward_zero_std": 0.9375, + "grad_norm": 4.254526071210821, + "kl": 0.466796875, + "learning_rate": 4.535414631131983e-06, + "loss": -0.0032, + "num_tokens": 35347452.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 3075.0, + "completions/max_terminated_length": 3075.0, + "completions/mean_length": 1082.421875, + "completions/mean_terminated_length": 972.6387329101562, + "completions/min_length": 26.0, + "completions/min_terminated_length": 26.0, + "epoch": 8.413793103448276, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3143490105876032, + "kl": 0.19195556640625, + "learning_rate": 4.526508026075746e-06, + "loss": -0.0017, + "num_tokens": 35617074.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 3420.0, + "completions/max_terminated_length": 3024.0, + "completions/mean_length": 1132.328125, + "completions/mean_terminated_length": 905.4086303710938, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 8.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1851143243039512, + "kl": 0.273681640625, + "learning_rate": 4.517526882300721e-06, + "loss": 0.0027, + "num_tokens": 35891916.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3865.0, + "completions/max_terminated_length": 2632.0, + "completions/mean_length": 1070.8515625, + "completions/mean_terminated_length": 909.5000610351562, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 8.551724137931034, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0980935432999941, + "kl": 0.21307373046875, + "learning_rate": 4.508471576637713e-06, + "loss": 0.0021, + "num_tokens": 36159617.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 3966.0, + "completions/mean_length": 1109.5859375, + "completions/mean_terminated_length": 994.86669921875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 8.620689655172415, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.23365837176809, + "kl": 0.13690185546875, + "learning_rate": 4.499342489029211e-06, + "loss": 0.0169, + "num_tokens": 36432716.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 3889.0, + "completions/max_terminated_length": 3384.0, + "completions/mean_length": 1092.75, + "completions/mean_terminated_length": 981.0812377929688, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 8.689655172413794, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.16032757011050167, + "kl": 0.10394287109375, + "learning_rate": 4.490140002513449e-06, + "loss": 0.0146, + "num_tokens": 36701396.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2425.0, + "completions/max_terminated_length": 2425.0, + "completions/mean_length": 1034.046875, + "completions/mean_terminated_length": 973.3114013671875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 8.758620689655173, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.10031944462051583, + "kl": 0.1348876953125, + "learning_rate": 4.48086450320833e-06, + "loss": 0.0099, + "num_tokens": 36964594.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3574.0, + "completions/max_terminated_length": 3378.0, + "completions/mean_length": 1042.8125, + "completions/mean_terminated_length": 974.8524169921875, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 8.827586206896552, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.17832977194904587, + "kl": 0.1573486328125, + "learning_rate": 4.4715163802952266e-06, + "loss": -0.0023, + "num_tokens": 37229146.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3886.0, + "completions/max_terminated_length": 3886.0, + "completions/mean_length": 1122.5859375, + "completions/mean_terminated_length": 1023.791748046875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 8.89655172413793, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.159287714280421, + "kl": 0.15380859375, + "learning_rate": 4.462096026002655e-06, + "loss": 0.0082, + "num_tokens": 37503909.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3549.0, + "completions/max_terminated_length": 3506.0, + "completions/mean_length": 1073.1015625, + "completions/mean_terminated_length": 993.8524169921875, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 8.96551724137931, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.1435096426454022, + "kl": 0.1278076171875, + "learning_rate": 4.4526038355898144e-06, + "loss": -0.0049, + "num_tokens": 37772338.0, + "reward": 0.0023437500931322575, + "reward_std": 0.004650149028748274, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 2773.0, + "completions/max_terminated_length": 2773.0, + "completions/mean_length": 1065.578125, + "completions/mean_terminated_length": 992.0743408203125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 9.068965517241379, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.157817499585136, + "kl": 0.17333984375, + "learning_rate": 4.4430402073300035e-06, + "loss": -0.0093, + "num_tokens": 38037804.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 3758.0, + "completions/max_terminated_length": 3386.0, + "completions/mean_length": 1141.515625, + "completions/mean_terminated_length": 1105.7381591796875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 9.137931034482758, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.18359050511600664, + "kl": 0.10345458984375, + "learning_rate": 4.433405542493909e-06, + "loss": 0.0256, + "num_tokens": 38314990.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 3082.0, + "completions/mean_length": 1154.953125, + "completions/mean_terminated_length": 992.8782348632812, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 9.206896551724139, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.11399074662146205, + "kl": 0.1798095703125, + "learning_rate": 4.4237002453327734e-06, + "loss": 0.0039, + "num_tokens": 38593896.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3829.0, + "completions/max_terminated_length": 3127.0, + "completions/mean_length": 1100.09375, + "completions/mean_terminated_length": 971.3933715820312, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 9.275862068965518, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.1709097015039483, + "kl": 0.15692138671875, + "learning_rate": 4.4139247230614245e-06, + "loss": -0.0009, + "num_tokens": 38865780.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3712.0, + "completions/max_terminated_length": 3399.0, + "completions/mean_length": 1115.109375, + "completions/mean_terminated_length": 1002.6966552734375, + "completions/min_length": 25.0, + "completions/min_terminated_length": 25.0, + "epoch": 9.344827586206897, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2622407919256445, + "kl": 0.145751953125, + "learning_rate": 4.404079385841201e-06, + "loss": 0.0066, + "num_tokens": 39138442.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3245.0, + "completions/max_terminated_length": 3245.0, + "completions/mean_length": 1110.4375, + "completions/mean_terminated_length": 1051.9434814453125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 9.413793103448276, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.13173411122101364, + "kl": 0.091552734375, + "learning_rate": 4.394164646762734e-06, + "loss": 0.0054, + "num_tokens": 39411418.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 3933.0, + "completions/max_terminated_length": 3087.0, + "completions/mean_length": 1342.8671875, + "completions/mean_terminated_length": 1057.9385986328125, + "completions/min_length": 23.0, + "completions/min_terminated_length": 23.0, + "epoch": 9.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.08021227471324485, + "kl": 0.158203125, + "learning_rate": 4.384180921828618e-06, + "loss": 0.0016, + "num_tokens": 39714377.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 3816.0, + "completions/max_terminated_length": 2590.0, + "completions/mean_length": 1384.328125, + "completions/mean_terminated_length": 1038.9815673828125, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 9.551724137931034, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.41718872220840225, + "kl": 0.433349609375, + "learning_rate": 4.374128629935955e-06, + "loss": 0.0163, + "num_tokens": 40022643.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 3622.0, + "completions/max_terminated_length": 3622.0, + "completions/mean_length": 1137.9921875, + "completions/mean_terminated_length": 1055.6990966796875, + "completions/min_length": 31.0, + "completions/min_terminated_length": 31.0, + "epoch": 9.620689655172415, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.16603761477760853, + "kl": 0.1663818359375, + "learning_rate": 4.364008192858781e-06, + "loss": 0.0016, + "num_tokens": 40298210.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 3639.0, + "completions/max_terminated_length": 2873.0, + "completions/mean_length": 1100.7578125, + "completions/mean_terminated_length": 949.0588989257812, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 9.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12834721614969635, + "kl": 0.16253662109375, + "learning_rate": 4.353820035230366e-06, + "loss": 0.0016, + "num_tokens": 40570179.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3675.0, + "completions/max_terminated_length": 2561.0, + "completions/mean_length": 1162.109375, + "completions/mean_terminated_length": 982.559326171875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 9.758620689655173, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.42904089811800383, + "kl": 0.1427001953125, + "learning_rate": 4.3435645845254e-06, + "loss": 0.0204, + "num_tokens": 40850001.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2396.0, + "completions/max_terminated_length": 2396.0, + "completions/mean_length": 1024.2890625, + "completions/mean_terminated_length": 969.9425659179688, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 9.827586206896552, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.04803927651974637, + "kl": 0.13232421875, + "learning_rate": 4.333242271042054e-06, + "loss": 0.0013, + "num_tokens": 41111014.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3613.0, + "completions/max_terminated_length": 2907.0, + "completions/mean_length": 1123.8203125, + "completions/mean_terminated_length": 1003.9586181640625, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 9.89655172413793, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.10662989148877, + "kl": 0.20062255859375, + "learning_rate": 4.32285352788393e-06, + "loss": 0.0274, + "num_tokens": 41385935.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3703.0, + "completions/max_terminated_length": 2656.0, + "completions/mean_length": 1096.5625, + "completions/mean_terminated_length": 962.5000610351562, + "completions/min_length": 36.0, + "completions/min_terminated_length": 36.0, + "epoch": 9.96551724137931, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.38374628240227976, + "kl": 0.775390625, + "learning_rate": 4.312398790941882e-06, + "loss": 0.023, + "num_tokens": 41656927.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 3610.0, + "completions/max_terminated_length": 2677.0, + "completions/mean_length": 1232.203125, + "completions/mean_terminated_length": 1068.411865234375, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 10.068965517241379, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.1052339109689304, + "kl": 0.47564697265625, + "learning_rate": 4.301878498875735e-06, + "loss": 0.0308, + "num_tokens": 41945721.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.46875, + "completions/max_length": 3781.0, + "completions/max_terminated_length": 2872.0, + "completions/mean_length": 1234.3515625, + "completions/mean_terminated_length": 1016.5765991210938, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 10.137931034482758, + "frac_reward_zero_std": 0.9375, + "grad_norm": 35.928120765270634, + "kl": 14.375, + "learning_rate": 4.291293093095873e-06, + "loss": 0.1488, + "num_tokens": 42234790.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3320.0, + "completions/max_terminated_length": 3320.0, + "completions/mean_length": 1105.1953125, + "completions/mean_terminated_length": 1062.725830078125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 10.206896551724139, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.4112262946098, + "kl": 3.7413330078125, + "learning_rate": 4.280643017744723e-06, + "loss": 0.0374, + "num_tokens": 42507327.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3702.0, + "completions/max_terminated_length": 3338.0, + "completions/mean_length": 996.4453125, + "completions/mean_terminated_length": 975.1417236328125, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 10.275862068965518, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.2040642472598806, + "kl": 0.04669189453125, + "learning_rate": 4.269928719678117e-06, + "loss": 0.0234, + "num_tokens": 42765504.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3703.0, + "completions/max_terminated_length": 2953.0, + "completions/mean_length": 1151.609375, + "completions/mean_terminated_length": 1033.728759765625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 10.344827586206897, + "frac_reward_zero_std": 0.875, + "grad_norm": 14.424156189088968, + "kl": 7.0703125, + "learning_rate": 4.2591506484465426e-06, + "loss": 0.0775, + "num_tokens": 43043982.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 3827.0, + "completions/max_terminated_length": 3827.0, + "completions/mean_length": 1085.0390625, + "completions/mean_terminated_length": 1022.5440673828125, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 10.413793103448276, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.597484250545503, + "kl": 1.0592041015625, + "learning_rate": 4.248309256276283e-06, + "loss": 0.0059, + "num_tokens": 43313035.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3439.0, + "completions/max_terminated_length": 2552.0, + "completions/mean_length": 1064.8359375, + "completions/mean_terminated_length": 966.7294311523438, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 10.482758620689655, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8861787967943775, + "kl": 0.5126953125, + "learning_rate": 4.23740499805044e-06, + "loss": 0.0101, + "num_tokens": 43578070.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 3494.0, + "completions/max_terminated_length": 3494.0, + "completions/mean_length": 1208.5078125, + "completions/mean_terminated_length": 1023.4512939453125, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 10.551724137931034, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4357968611893147, + "kl": 1.07177734375, + "learning_rate": 4.22643833128985e-06, + "loss": 0.0254, + "num_tokens": 43863831.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3829.0, + "completions/max_terminated_length": 2697.0, + "completions/mean_length": 1155.1328125, + "completions/mean_terminated_length": 1037.631103515625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 10.620689655172415, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.22348726052808882, + "kl": 0.13751220703125, + "learning_rate": 4.215409716133885e-06, + "loss": 0.004, + "num_tokens": 44142760.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 2652.0, + "completions/mean_length": 1111.03125, + "completions/mean_terminated_length": 1002.6470947265625, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 10.689655172413794, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7205943652484093, + "kl": 0.1025390625, + "learning_rate": 4.204319615321151e-06, + "loss": 0.0214, + "num_tokens": 44416044.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 3084.0, + "completions/max_terminated_length": 2813.0, + "completions/mean_length": 1121.2578125, + "completions/mean_terminated_length": 1068.4031982421875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 10.758620689655173, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7995609561077808, + "kl": 0.06732177734375, + "learning_rate": 4.193168494170065e-06, + "loss": 0.0238, + "num_tokens": 44690637.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 3615.0, + "completions/max_terminated_length": 3593.0, + "completions/mean_length": 1146.625, + "completions/mean_terminated_length": 1007.7227172851562, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 10.827586206896552, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.1171622175387835, + "kl": 0.1898193359375, + "learning_rate": 4.181956820559339e-06, + "loss": 0.0112, + "num_tokens": 44967149.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 2590.0, + "completions/mean_length": 1204.2421875, + "completions/mean_terminated_length": 983.7368774414062, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 10.89655172413793, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.1873253186072465, + "kl": 0.203369140625, + "learning_rate": 4.170685064908342e-06, + "loss": 0.0186, + "num_tokens": 45252364.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 3251.0, + "completions/max_terminated_length": 2182.0, + "completions/mean_length": 1139.265625, + "completions/mean_terminated_length": 929.5228881835938, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 10.96551724137931, + "frac_reward_zero_std": 0.8125, + "grad_norm": 3.724868026814391, + "kl": 0.2445068359375, + "learning_rate": 4.159353700157365e-06, + "loss": 0.0321, + "num_tokens": 45528118.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 3597.0, + "completions/max_terminated_length": 3071.0, + "completions/mean_length": 1286.359375, + "completions/mean_terminated_length": 1075.72412109375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 11.068965517241379, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5905977263071814, + "kl": 0.44775390625, + "learning_rate": 4.14796320174778e-06, + "loss": 0.0148, + "num_tokens": 45823844.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 3686.0, + "completions/mean_length": 1284.21875, + "completions/mean_terminated_length": 997.7344970703125, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 11.137931034482758, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6163712852564641, + "kl": 0.421630859375, + "learning_rate": 4.136514047602087e-06, + "loss": 0.017, + "num_tokens": 46118856.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 3128.0, + "completions/mean_length": 1169.3515625, + "completions/mean_terminated_length": 1072.76416015625, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 11.206896551724139, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2294368361263189, + "kl": 0.083740234375, + "learning_rate": 4.1250067181038635e-06, + "loss": 0.0414, + "num_tokens": 46399605.0, + "reward": 0.007031249813735485, + "reward_std": 0.019887376576662064, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3522.0, + "completions/max_terminated_length": 2752.0, + "completions/mean_length": 1179.953125, + "completions/mean_terminated_length": 1068.36669921875, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "epoch": 11.275862068965518, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5748097249061757, + "kl": 0.138427734375, + "learning_rate": 4.113441696077608e-06, + "loss": 0.0303, + "num_tokens": 46681711.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3321.0, + "completions/max_terminated_length": 3321.0, + "completions/mean_length": 1342.6328125, + "completions/mean_terminated_length": 1189.296630859375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 11.344827586206897, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8058614856067299, + "kl": 0.1512451171875, + "learning_rate": 4.101819466768484e-06, + "loss": 0.0295, + "num_tokens": 46983472.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 3656.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 1227.3671875, + "completions/mean_terminated_length": 1120.6474609375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.413793103448276, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2566916575044885, + "kl": 0.13800048828125, + "learning_rate": 4.0901405178219535e-06, + "loss": 0.0504, + "num_tokens": 47271647.0, + "reward": 0.00937500037252903, + "reward_std": 0.02346404455602169, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3262.0, + "completions/max_terminated_length": 3262.0, + "completions/mean_length": 1120.6875, + "completions/mean_terminated_length": 1103.9920654296875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 11.482758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.0736888172198473, + "kl": 0.048828125, + "learning_rate": 4.078405339263326e-06, + "loss": 0.0137, + "num_tokens": 47546167.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.1875, + "completions/max_length": 3549.0, + "completions/max_terminated_length": 2779.0, + "completions/mean_length": 1352.96875, + "completions/mean_terminated_length": 1016.921630859375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 11.551724137931034, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.6005923512486049, + "kl": 0.984375, + "learning_rate": 4.06661442347719e-06, + "loss": 0.0458, + "num_tokens": 47850419.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 3155.0, + "completions/max_terminated_length": 3155.0, + "completions/mean_length": 1171.25, + "completions/mean_terminated_length": 1157.6220703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 11.620689655172415, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.14078818110253288, + "kl": 0.04827880859375, + "learning_rate": 4.054768265186758e-06, + "loss": 0.0359, + "num_tokens": 48131411.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 3781.0, + "completions/max_terminated_length": 2452.0, + "completions/mean_length": 1255.859375, + "completions/mean_terminated_length": 1105.821533203125, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 11.689655172413794, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2381277561820383, + "kl": 0.73291015625, + "learning_rate": 4.0428673614331036e-06, + "loss": 0.023, + "num_tokens": 48423233.0, + "reward": 0.0062500000931322575, + "reward_std": 0.014625209383666515, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.46875, + "completions/max_length": 3201.0, + "completions/max_terminated_length": 3201.0, + "completions/mean_length": 1275.1796875, + "completions/mean_terminated_length": 1040.189208984375, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 11.758620689655173, + "frac_reward_zero_std": 0.9375, + "grad_norm": 7.280137346077164, + "kl": 4.1153564453125, + "learning_rate": 4.030912211554316e-06, + "loss": 0.0174, + "num_tokens": 48717528.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 3781.0, + "completions/mean_length": 1331.5859375, + "completions/mean_terminated_length": 1059.8304443359375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 11.827586206896552, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.9744599773603109, + "kl": 2.4501953125, + "learning_rate": 4.018903317164539e-06, + "loss": 0.0407, + "num_tokens": 49019043.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2779.0, + "completions/mean_length": 1198.59375, + "completions/mean_terminated_length": 1045.2689208984375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 11.89655172413793, + "frac_reward_zero_std": 0.875, + "grad_norm": 4.653929117412743, + "kl": 2.8310546875, + "learning_rate": 4.006841182132932e-06, + "loss": 0.0342, + "num_tokens": 49300087.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.71875, + "completions/max_length": 3651.0, + "completions/max_terminated_length": 2915.0, + "completions/mean_length": 1265.8515625, + "completions/mean_terminated_length": 1117.411865234375, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 11.96551724137931, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6234316034794274, + "kl": 0.6668701171875, + "learning_rate": 3.9947263125625195e-06, + "loss": 0.0188, + "num_tokens": 49592092.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3644.0, + "completions/max_terminated_length": 3259.0, + "completions/mean_length": 1238.4140625, + "completions/mean_terminated_length": 1073.966064453125, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 12.068965517241379, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.17122087218075646, + "kl": 0.2076416015625, + "learning_rate": 3.982559216768967e-06, + "loss": 0.0231, + "num_tokens": 49881681.0, + "reward": 0.00390625, + "reward_std": 0.009069565683603287, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2939.0, + "completions/mean_length": 1182.0234375, + "completions/mean_terminated_length": 1082.7850341796875, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 12.137931034482758, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7991821557743215, + "kl": 0.10546875, + "learning_rate": 3.970340405259245e-06, + "loss": 0.0297, + "num_tokens": 50162444.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3125.0, + "completions/max_terminated_length": 2901.0, + "completions/mean_length": 1098.0625, + "completions/mean_terminated_length": 997.43798828125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 12.206896551724139, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.2841276215892951, + "kl": 0.1004638671875, + "learning_rate": 3.958070390710214e-06, + "loss": 0.0165, + "num_tokens": 50434068.0, + "reward": 0.0078125, + "reward_std": 0.022097084671258926, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 3582.0, + "completions/max_terminated_length": 3582.0, + "completions/mean_length": 1317.7734375, + "completions/mean_terminated_length": 1081.73681640625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 12.275862068965518, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.16105398069309368, + "kl": 0.3125, + "learning_rate": 3.945749687947109e-06, + "loss": 0.0182, + "num_tokens": 50732911.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.21875, + "completions/max_length": 3694.0, + "completions/max_terminated_length": 3050.0, + "completions/mean_length": 1322.953125, + "completions/mean_terminated_length": 1041.291259765625, + "completions/min_length": 32.0, + "completions/min_terminated_length": 32.0, + "epoch": 12.344827586206897, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2090540532526456, + "kl": 0.37744140625, + "learning_rate": 3.933378813921942e-06, + "loss": 0.0267, + "num_tokens": 51033321.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.4375, + "completions/max_length": 3779.0, + "completions/max_terminated_length": 2966.0, + "completions/mean_length": 1344.234375, + "completions/mean_terminated_length": 1135.8726806640625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 12.413793103448276, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4311077408988136, + "kl": 0.284423828125, + "learning_rate": 3.920958287691811e-06, + "loss": 0.0218, + "num_tokens": 51336455.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 3701.0, + "completions/max_terminated_length": 2957.0, + "completions/mean_length": 1109.2421875, + "completions/mean_terminated_length": 1054.152099609375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 12.482758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.12068220717626289, + "kl": 0.05767822265625, + "learning_rate": 3.908488630397121e-06, + "loss": 0.0053, + "num_tokens": 51608414.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3180.0, + "completions/max_terminated_length": 2710.0, + "completions/mean_length": 1074.359375, + "completions/mean_terminated_length": 977.718994140625, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 12.551724137931034, + "frac_reward_zero_std": 0.5625, + "grad_norm": 3.1746304474076985, + "kl": 0.1571044921875, + "learning_rate": 3.8959703652397175e-06, + "loss": 0.0611, + "num_tokens": 51875836.0, + "reward": 0.01171875, + "reward_std": 0.031166650354862213, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.24301259219646454, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 3597.0, + "completions/max_terminated_length": 2893.0, + "completions/mean_length": 1266.734375, + "completions/mean_terminated_length": 999.9464721679688, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 12.620689655172415, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.207418864003099, + "kl": 0.321044921875, + "learning_rate": 3.883404017460935e-06, + "loss": 0.0392, + "num_tokens": 52167906.0, + "reward": 0.007031250279396772, + "reward_std": 0.016834918409585953, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 3471.0, + "completions/mean_length": 1417.796875, + "completions/mean_terminated_length": 1209.271240234375, + "completions/min_length": 33.0, + "completions/min_terminated_length": 33.0, + "epoch": 12.689655172413794, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.09955903326233259, + "kl": 0.176025390625, + "learning_rate": 3.870790114319559e-06, + "loss": 0.0072, + "num_tokens": 52480456.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 3248.0, + "completions/mean_length": 1108.2109375, + "completions/mean_terminated_length": 1078.6270751953125, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 12.758620689655173, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.24560735641594267, + "kl": 0.0670166015625, + "learning_rate": 3.858129185069701e-06, + "loss": 0.0139, + "num_tokens": 52753379.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 3961.0, + "completions/max_terminated_length": 3153.0, + "completions/mean_length": 1337.1640625, + "completions/mean_terminated_length": 1102.4866943359375, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 12.827586206896552, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.6007499005127412, + "kl": 0.50238037109375, + "learning_rate": 3.845421760938597e-06, + "loss": 0.0465, + "num_tokens": 53055376.0, + "reward": 0.007031249813735485, + "reward_std": 0.017908399924635887, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 3639.0, + "completions/max_terminated_length": 3604.0, + "completions/mean_length": 1307.8125, + "completions/mean_terminated_length": 1093.5, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 12.89655172413793, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.2714273921718127, + "kl": 0.23095703125, + "learning_rate": 3.832668375104312e-06, + "loss": 0.0268, + "num_tokens": 53353848.0, + "reward": 0.00546875037252903, + "reward_std": 0.015467960387468338, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 3364.0, + "completions/mean_length": 1279.1484375, + "completions/mean_terminated_length": 1054.80859375, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 12.96551724137931, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.36704943028353426, + "kl": 0.5118408203125, + "learning_rate": 3.8198695626733725e-06, + "loss": 0.0337, + "num_tokens": 53648651.0, + "reward": 0.004687500186264515, + "reward_std": 0.010205792263150215, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3973.0, + "completions/max_terminated_length": 3010.0, + "completions/mean_length": 1346.34375, + "completions/mean_terminated_length": 1185.033935546875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 13.068965517241379, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.1481876574798495, + "kl": 0.763671875, + "learning_rate": 3.8070258606583156e-06, + "loss": 0.0321, + "num_tokens": 53952055.0, + "reward": 0.007031250279396772, + "reward_std": 0.013782460242509842, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 3703.0, + "completions/max_terminated_length": 3643.0, + "completions/mean_length": 1286.375, + "completions/mean_terminated_length": 1048.99072265625, + "completions/min_length": 38.0, + "completions/min_terminated_length": 38.0, + "epoch": 13.137931034482758, + "frac_reward_zero_std": 0.625, + "grad_norm": 2.673552770064533, + "kl": 1.23779296875, + "learning_rate": 3.7941378079551544e-06, + "loss": 0.0481, + "num_tokens": 54247783.0, + "reward": 0.01328125037252903, + "reward_std": 0.027723699808120728, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.53125, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 2962.0, + "completions/mean_length": 1398.671875, + "completions/mean_terminated_length": 1144.3450927734375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 13.206896551724139, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5552309526375896, + "kl": 0.97216796875, + "learning_rate": 3.7812059453207677e-06, + "loss": 0.0689, + "num_tokens": 54557885.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 3720.0, + "completions/max_terminated_length": 3228.0, + "completions/mean_length": 1557.703125, + "completions/mean_terminated_length": 1227.1099853515625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 13.275862068965518, + "frac_reward_zero_std": 0.875, + "grad_norm": 9.1078368022765, + "kl": 5.70703125, + "learning_rate": 3.768230815350213e-06, + "loss": 0.0633, + "num_tokens": 54888343.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3875.0, + "completions/mean_length": 1598.5390625, + "completions/mean_terminated_length": 1207.463623046875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 13.344827586206897, + "frac_reward_zero_std": 0.9375, + "grad_norm": 31.899363924236727, + "kl": 20.453125, + "learning_rate": 3.7552129624539557e-06, + "loss": 0.2114, + "num_tokens": 55221692.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 3001.0, + "completions/mean_length": 1382.3046875, + "completions/mean_terminated_length": 1149.27587890625, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 13.413793103448276, + "frac_reward_zero_std": 0.75, + "grad_norm": 8.608620637191756, + "kl": 5.47119140625, + "learning_rate": 3.7421529328350316e-06, + "loss": 0.0755, + "num_tokens": 55529259.0, + "reward": 0.00546875037252903, + "reward_std": 0.015467960387468338, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.09375, + "completions/max_length": 3634.0, + "completions/max_terminated_length": 3476.0, + "completions/mean_length": 1699.5859375, + "completions/mean_terminated_length": 1204.757568359375, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 13.482758620689655, + "frac_reward_zero_std": 0.6875, + "grad_norm": 21.27014366022027, + "kl": 14.7578125, + "learning_rate": 3.7290512744661274e-06, + "loss": 0.1867, + "num_tokens": 55877878.0, + "reward": 0.008593750186264515, + "reward_std": 0.02125433459877968, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.78125, + "completions/max_length": 3644.0, + "completions/max_terminated_length": 3129.0, + "completions/mean_length": 1274.1328125, + "completions/mean_terminated_length": 1156.4627685546875, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 13.551724137931034, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6348817319536253, + "kl": 0.481689453125, + "learning_rate": 3.715908537066589e-06, + "loss": 0.0188, + "num_tokens": 56170895.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.59375, + "completions/max_length": 3876.0, + "completions/max_terminated_length": 3179.0, + "completions/mean_length": 1417.875, + "completions/mean_terminated_length": 1151.973876953125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 13.620689655172415, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.8195574686928588, + "kl": 3.0634765625, + "learning_rate": 3.7027252720793538e-06, + "loss": 0.0722, + "num_tokens": 56483455.0, + "reward": 0.00937500037252903, + "reward_std": 0.0265165027230978, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3445.0, + "completions/mean_length": 1423.90625, + "completions/mean_terminated_length": 1207.26318359375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.689655172413794, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.4766881349959924, + "kl": 1.093017578125, + "learning_rate": 3.689502032647817e-06, + "loss": 0.0662, + "num_tokens": 56795883.0, + "reward": 0.01015624962747097, + "reward_std": 0.028726210817694664, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.4375, + "completions/max_length": 3701.0, + "completions/max_terminated_length": 3503.0, + "completions/mean_length": 1477.015625, + "completions/mean_terminated_length": 1120.2908935546875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 13.758620689655173, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7111623568121386, + "kl": 0.60986328125, + "learning_rate": 3.6762393735926245e-06, + "loss": 0.0385, + "num_tokens": 57116013.0, + "reward": 0.004687500186264515, + "reward_std": 0.0132582513615489, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 4078.0, + "completions/max_terminated_length": 2991.0, + "completions/mean_length": 1320.6796875, + "completions/mean_terminated_length": 1133.3504638671875, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 13.827586206896552, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4078028037583223, + "kl": 0.148681640625, + "learning_rate": 3.6629378513883852e-06, + "loss": -0.002, + "num_tokens": 57415036.0, + "reward": 0.0078125, + "reward_std": 0.019044626504182816, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.21875, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 2967.0, + "completions/mean_length": 1565.1640625, + "completions/mean_terminated_length": 1047.116455078125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 13.89655172413793, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.23178510046600032, + "kl": 0.50244140625, + "learning_rate": 3.6495980241403307e-06, + "loss": 0.0283, + "num_tokens": 57746449.0, + "reward": 0.0054687499068677425, + "reward_std": 0.015467959456145763, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3301.0, + "completions/mean_length": 1459.15625, + "completions/mean_terminated_length": 1147.72802734375, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 13.96551724137931, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2444494445748395, + "kl": 0.1614990234375, + "learning_rate": 3.636220451560896e-06, + "loss": 0.0406, + "num_tokens": 58064061.0, + "reward": 0.00937500037252903, + "reward_std": 0.0265165027230978, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2865.0, + "completions/mean_length": 1537.328125, + "completions/mean_terminated_length": 1081.9056396484375, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 14.068965517241379, + "frac_reward_zero_std": 0.625, + "grad_norm": 4.202574842240394, + "kl": 0.343017578125, + "learning_rate": 3.622805694946235e-06, + "loss": 0.0732, + "num_tokens": 58391007.0, + "reward": 0.008593750186264515, + "reward_std": 0.02430679462850094, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.15625, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 3439.0, + "completions/mean_length": 1681.4765625, + "completions/mean_terminated_length": 1135.1881103515625, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 14.137931034482758, + "frac_reward_zero_std": 0.75, + "grad_norm": 2.849201175527445, + "kl": 0.3662109375, + "learning_rate": 3.609354317152667e-06, + "loss": 0.0143, + "num_tokens": 58737308.0, + "reward": 0.007031249813735485, + "reward_std": 0.016834918409585953, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.375, + "completions/max_length": 3639.0, + "completions/max_terminated_length": 3445.0, + "completions/mean_length": 1557.609375, + "completions/mean_terminated_length": 1218.6851806640625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 14.206896551724139, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.25750281797892505, + "kl": 0.397705078125, + "learning_rate": 3.595866882573063e-06, + "loss": 0.0404, + "num_tokens": 59067754.0, + "reward": 0.0078125, + "reward_std": 0.022097084671258926, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3679.0, + "completions/mean_length": 1568.7890625, + "completions/mean_terminated_length": 1195.89990234375, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 14.275862068965518, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5981729275912904, + "kl": 0.2265625, + "learning_rate": 3.5823439571131675e-06, + "loss": 0.0397, + "num_tokens": 59395991.0, + "reward": 0.00937500037252903, + "reward_std": 0.0265165027230978, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0625, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 3035.0, + "completions/mean_length": 1803.53125, + "completions/mean_terminated_length": 1195.30615234375, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 14.344827586206897, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.135149590775422, + "kl": 0.8017578125, + "learning_rate": 3.5687861081678477e-06, + "loss": 0.0678, + "num_tokens": 59757915.0, + "reward": 0.0078125, + "reward_std": 0.022097084671258926, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.96875, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 3719.0, + "completions/mean_length": 1811.453125, + "completions/mean_terminated_length": 1174.9473876953125, + "completions/min_length": 54.0, + "completions/min_terminated_length": 54.0, + "epoch": 14.413793103448276, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.040721248461896, + "kl": 0.904052734375, + "learning_rate": 3.555193904597291e-06, + "loss": 0.0588, + "num_tokens": 60120853.0, + "reward": 0.0078125, + "reward_std": 0.015992168337106705, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.09375, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 3369.0, + "completions/mean_length": 1775.4609375, + "completions/mean_terminated_length": 1164.0, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 14.482758620689655, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3439397685636572, + "kl": 0.317626953125, + "learning_rate": 3.541567916703138e-06, + "loss": 0.0157, + "num_tokens": 60479184.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 3484.0, + "completions/mean_length": 1820.671875, + "completions/mean_terminated_length": 1123.135498046875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 14.551724137931034, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.2633427938950654, + "kl": 0.63671875, + "learning_rate": 3.5279087162045517e-06, + "loss": 0.0301, + "num_tokens": 60843302.0, + "reward": 0.0078125, + "reward_std": 0.019044626504182816, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.09375, + "completions/max_length": 3779.0, + "completions/max_terminated_length": 3535.0, + "completions/mean_length": 1807.640625, + "completions/mean_terminated_length": 1250.888916015625, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 14.620689655172415, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5078428161832284, + "kl": 0.7353515625, + "learning_rate": 3.5142168762142265e-06, + "loss": 0.0156, + "num_tokens": 61205752.0, + "reward": 0.004687500186264515, + "reward_std": 0.010205792263150215, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.78125, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 3727.0, + "completions/mean_length": 2058.3828125, + "completions/mean_terminated_length": 1210.6292724609375, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 14.689655172413794, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.5381994511295285, + "kl": 1.01220703125, + "learning_rate": 3.500492971214347e-06, + "loss": 0.0944, + "num_tokens": 61600297.0, + "reward": 0.010937499813735485, + "reward_std": 0.024831002578139305, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.6875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3662.0, + "completions/mean_length": 2038.375, + "completions/mean_terminated_length": 1164.9766845703125, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 14.758620689655173, + "frac_reward_zero_std": 0.5, + "grad_norm": 3.2511400903390695, + "kl": 1.201171875, + "learning_rate": 3.48673757703248e-06, + "loss": 0.1018, + "num_tokens": 61991113.0, + "reward": 0.014062500558793545, + "reward_std": 0.036722294986248016, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.6875, + "completions/max_length": 3760.0, + "completions/max_terminated_length": 3302.0, + "completions/mean_length": 1885.109375, + "completions/mean_terminated_length": 1156.837158203125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 14.827586206896552, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.492459393954852, + "kl": 0.744140625, + "learning_rate": 3.472951270817418e-06, + "loss": 0.0914, + "num_tokens": 62363479.0, + "reward": 0.01640624925494194, + "reward_std": 0.03724650293588638, + "rewards/code_format_reward/mean": 0.078125, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.65625, + "completions/max_length": 3964.0, + "completions/max_terminated_length": 3438.0, + "completions/mean_length": 1984.21875, + "completions/mean_terminated_length": 1081.0118408203125, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 14.89655172413793, + "frac_reward_zero_std": 0.625, + "grad_norm": 4.7008931014840245, + "kl": 1.0595703125, + "learning_rate": 3.4591346310149578e-06, + "loss": 0.1139, + "num_tokens": 62748531.0, + "reward": 0.010937499813735485, + "reward_std": 0.026977965608239174, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.24301259219646454, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.75, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 3370.0, + "completions/mean_length": 2012.0390625, + "completions/mean_terminated_length": 1207.4659423828125, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 14.96551724137931, + "frac_reward_zero_std": 0.4375, + "grad_norm": 2.9389612842832147, + "kl": 4.7158203125, + "learning_rate": 3.445288237343632e-06, + "loss": 0.206, + "num_tokens": 63137144.0, + "reward": 0.02734375, + "reward_std": 0.048597924411296844, + "rewards/code_format_reward/mean": 0.1328125, + "rewards/code_format_reward/std": 0.3407054841518402, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3490002751350403, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.09375, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 2764.0, + "completions/mean_length": 2428.8046875, + "completions/mean_terminated_length": 1306.1641845703125, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 15.068965517241379, + "frac_reward_zero_std": 0.625, + "grad_norm": 4.45397792495567, + "kl": 4.40234375, + "learning_rate": 3.4314126707703895e-06, + "loss": 0.1195, + "num_tokens": 63579103.0, + "reward": 0.008593750186264515, + "reward_std": 0.02430679276585579, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3495.0, + "completions/mean_length": 1989.8828125, + "completions/mean_terminated_length": 1206.3145751953125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 15.137931034482758, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.5710707568898774, + "kl": 2.033203125, + "learning_rate": 3.4175085134862128e-06, + "loss": 0.0981, + "num_tokens": 63962616.0, + "reward": 0.01328125037252903, + "reward_std": 0.029189826920628548, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.65625, + "completions/max_length": 4078.0, + "completions/max_terminated_length": 3478.0, + "completions/mean_length": 1967.8125, + "completions/mean_terminated_length": 1057.5765380859375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 15.206896551724139, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.31964770163417183, + "kl": 1.7822265625, + "learning_rate": 3.4035763488816953e-06, + "loss": 0.0976, + "num_tokens": 64345568.0, + "reward": 0.0062500000931322575, + "reward_std": 0.0176776684820652, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.34375, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 3514.0, + "completions/mean_length": 2182.1640625, + "completions/mean_terminated_length": 1087.0933837890625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 15.275862068965518, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0092039228909133, + "kl": 4.09375, + "learning_rate": 3.3896167615225594e-06, + "loss": 0.1554, + "num_tokens": 64755957.0, + "reward": 0.015625, + "reward_std": 0.03808925300836563, + "rewards/code_format_reward/mean": 0.078125, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2694226801395416, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.59375, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 3093.0, + "completions/mean_length": 1987.40625, + "completions/mean_terminated_length": 1125.795166015625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 15.344827586206897, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.9154515107257004, + "kl": 1.12060546875, + "learning_rate": 3.375630337125133e-06, + "loss": 0.1725, + "num_tokens": 65141417.0, + "reward": 0.02656250074505806, + "reward_std": 0.052394941449165344, + "rewards/code_format_reward/mean": 0.1328125, + "rewards/code_format_reward/std": 0.3407054841518402, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.3407054841518402, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.59375, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 3141.0, + "completions/mean_length": 2030.7265625, + "completions/mean_terminated_length": 1072.77099609375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 15.413793103448276, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.8370610521795743, + "kl": 2.37890625, + "learning_rate": 3.361617662531772e-06, + "loss": 0.1042, + "num_tokens": 65532422.0, + "reward": 0.01875000074505806, + "reward_std": 0.04082317277789116, + "rewards/code_format_reward/mean": 0.09375, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3118.0, + "completions/mean_length": 2137.3359375, + "completions/mean_terminated_length": 1143.5316162109375, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 15.482758620689655, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.25557061574182, + "kl": 1.6552734375, + "learning_rate": 3.347579325686237e-06, + "loss": 0.1034, + "num_tokens": 65936169.0, + "reward": 0.012500000186264515, + "reward_std": 0.031241057440638542, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2694226801395416, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3282.0, + "completions/mean_length": 2148.4140625, + "completions/mean_terminated_length": 1096.0506591796875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 15.551724137931034, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.8263881956490176, + "kl": 2.603515625, + "learning_rate": 3.333515915609027e-06, + "loss": 0.1221, + "num_tokens": 66341094.0, + "reward": 0.015625, + "reward_std": 0.03413129970431328, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.78125, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 3126.0, + "completions/mean_length": 1835.375, + "completions/mean_terminated_length": 924.03369140625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 15.620689655172415, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5981599420132299, + "kl": 2.189453125, + "learning_rate": 3.3194280223726616e-06, + "loss": 0.1519, + "num_tokens": 66707094.0, + "reward": 0.01953125, + "reward_std": 0.0434223935008049, + "rewards/code_format_reward/mean": 0.09375, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.75, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3533.0, + "completions/mean_length": 1926.1171875, + "completions/mean_terminated_length": 1125.227294921875, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 15.689655172413794, + "frac_reward_zero_std": 0.375, + "grad_norm": 2.4515527014509053, + "kl": 1.6328125, + "learning_rate": 3.305316237076927e-06, + "loss": 0.1069, + "num_tokens": 67083541.0, + "reward": 0.015625, + "reward_std": 0.04114171117544174, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.53125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3149.0, + "completions/mean_length": 2122.3125, + "completions/mean_terminated_length": 1097.654296875, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 15.758620689655173, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.171755009564972, + "kl": 2.701171875, + "learning_rate": 3.291181151824071e-06, + "loss": 0.12, + "num_tokens": 67485829.0, + "reward": 0.01250000111758709, + "reward_std": 0.033376358449459076, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2694226801395416, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.96875, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 3089.0, + "completions/mean_length": 1813.6875, + "completions/mean_terminated_length": 1124.4947509765625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 15.827586206896552, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5046649652436593, + "kl": 2.0888671875, + "learning_rate": 3.27702335969396e-06, + "loss": 0.11, + "num_tokens": 67849053.0, + "reward": 0.02421875111758709, + "reward_std": 0.045765817165374756, + "rewards/code_format_reward/mean": 0.1171875, + "rewards/code_format_reward/std": 0.322907418012619, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.6875, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 3391.0, + "completions/mean_length": 1980.9765625, + "completions/mean_terminated_length": 1105.8836669921875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 15.89655172413793, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.7928131225427445, + "kl": 2.44140625, + "learning_rate": 3.2628434547191985e-06, + "loss": 0.1144, + "num_tokens": 68233690.0, + "reward": 0.01718750037252903, + "reward_std": 0.042055923491716385, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.4375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3099.0, + "completions/mean_length": 2097.0, + "completions/mean_terminated_length": 961.4871826171875, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 15.96551724137931, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0489042713154726, + "kl": 2.595703125, + "learning_rate": 3.2486420318601973e-06, + "loss": 0.148, + "num_tokens": 68632946.0, + "reward": 0.02109375223517418, + "reward_std": 0.03914884477853775, + "rewards/code_format_reward/mean": 0.0859375, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.8125, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 3331.0, + "completions/mean_length": 1801.9296875, + "completions/mean_terminated_length": 960.8778076171875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 16.06896551724138, + "frac_reward_zero_std": 0.4375, + "grad_norm": 4.154592071814475, + "kl": 1.04931640625, + "learning_rate": 3.2344196869802187e-06, + "loss": 0.1281, + "num_tokens": 68994665.0, + "reward": 0.01875000074505806, + "reward_std": 0.04007909819483757, + "rewards/code_format_reward/mean": 0.078125, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.875, + "completions/max_length": 3865.0, + "completions/max_terminated_length": 3621.0, + "completions/mean_length": 1865.6015625, + "completions/mean_terminated_length": 1114.8370361328125, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 16.137931034482758, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.3591517303477938, + "kl": 1.84765625, + "learning_rate": 3.2201770168203694e-06, + "loss": 0.1793, + "num_tokens": 69364534.0, + "reward": 0.02656250074505806, + "reward_std": 0.05918382853269577, + "rewards/code_format_reward/mean": 0.1328125, + "rewards/code_format_reward/std": 0.3407054841518402, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.3407054841518402, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.4375, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 3580.0, + "completions/mean_length": 2076.515625, + "completions/mean_terminated_length": 942.3846435546875, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 16.20689655172414, + "frac_reward_zero_std": 0.1875, + "grad_norm": 0.806887397746818, + "kl": 5.0390625, + "learning_rate": 3.205914618974563e-06, + "loss": 0.25, + "num_tokens": 69761400.0, + "reward": 0.02812499925494194, + "reward_std": 0.059036217629909515, + "rewards/code_format_reward/mean": 0.1171875, + "rewards/code_format_reward/std": 0.322907418012619, + "rewards/format_reward/mean": 0.1640625, + "rewards/format_reward/std": 0.371787428855896, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.3125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3545.0, + "completions/mean_length": 2223.921875, + "completions/mean_terminated_length": 1006.7838134765625, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 16.275862068965516, + "frac_reward_zero_std": 0.5, + "grad_norm": 35.47745775038266, + "kl": 26.03125, + "learning_rate": 3.1916330918644496e-06, + "loss": 0.4003, + "num_tokens": 70176038.0, + "reward": 0.01953125, + "reward_std": 0.038390956819057465, + "rewards/code_format_reward/mean": 0.0859375, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.34375, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 2502.0, + "completions/mean_length": 2152.0859375, + "completions/mean_terminated_length": 1088.0267333984375, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 16.344827586206897, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.2503280834421062, + "kl": 2.466796875, + "learning_rate": 3.177333034714303e-06, + "loss": 0.1744, + "num_tokens": 70582577.0, + "reward": 0.01796874962747097, + "reward_std": 0.042448077350854874, + "rewards/code_format_reward/mean": 0.078125, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.21875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3593.0, + "completions/mean_length": 2271.9375, + "completions/mean_terminated_length": 950.7464599609375, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 16.413793103448278, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.602351826361304, + "kl": 6.76953125, + "learning_rate": 3.1630150475258813e-06, + "loss": 0.2266, + "num_tokens": 71003289.0, + "reward": 0.02265625074505806, + "reward_std": 0.05599765479564667, + "rewards/code_format_reward/mean": 0.109375, + "rewards/code_format_reward/std": 0.31333550810813904, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3756.0, + "completions/mean_length": 2153.6953125, + "completions/mean_terminated_length": 891.8289794921875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 16.482758620689655, + "frac_reward_zero_std": 0.25, + "grad_norm": 1.070332411448895, + "kl": 3.322265625, + "learning_rate": 3.148679731053252e-06, + "loss": 0.2088, + "num_tokens": 71409802.0, + "reward": 0.02031249925494194, + "reward_std": 0.046327732503414154, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.3407054841518402, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2868.0, + "completions/mean_length": 2141.875, + "completions/mean_terminated_length": 917.4473876953125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 16.551724137931036, + "frac_reward_zero_std": 0.4375, + "grad_norm": 5.424674912817685, + "kl": 6.75, + "learning_rate": 3.1343276867775805e-06, + "loss": 0.1729, + "num_tokens": 71814130.0, + "reward": 0.01953125186264515, + "reward_std": 0.03921514004468918, + "rewards/code_format_reward/mean": 0.0859375, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2442.0, + "completions/mean_length": 2093.890625, + "completions/mean_terminated_length": 907.3125, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 16.620689655172413, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.374418133895407, + "kl": 2.6484375, + "learning_rate": 3.1199595168819043e-06, + "loss": 0.1271, + "num_tokens": 72212076.0, + "reward": 0.01718750037252903, + "reward_std": 0.04312940686941147, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.40625, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 2976.0, + "completions/mean_length": 2016.7421875, + "completions/mean_terminated_length": 873.5584106445312, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 16.689655172413794, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.8924129745454485, + "kl": 1.4404296875, + "learning_rate": 3.105575824225852e-06, + "loss": 0.1844, + "num_tokens": 72601291.0, + "reward": 0.02500000037252903, + "reward_std": 0.05301665514707565, + "rewards/code_format_reward/mean": 0.1015625, + "rewards/code_format_reward/std": 0.3032590448856354, + "rewards/format_reward/mean": 0.1484375, + "rewards/format_reward/std": 0.356930136680603, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.53125, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 2321.0, + "completions/mean_length": 1980.5859375, + "completions/mean_terminated_length": 908.4815063476562, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 16.75862068965517, + "frac_reward_zero_std": 0.375, + "grad_norm": 1.4340625784111272, + "kl": 1.90234375, + "learning_rate": 3.091177212320363e-06, + "loss": 0.1097, + "num_tokens": 72985878.0, + "reward": 0.01640624925494194, + "reward_std": 0.03679375723004341, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.28125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3397.0, + "completions/mean_length": 2147.96875, + "completions/mean_terminated_length": 995.643798828125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 16.82758620689655, + "frac_reward_zero_std": 0.5, + "grad_norm": 2.7268618682978207, + "kl": 1.09375, + "learning_rate": 3.0767642853023538e-06, + "loss": 0.1017, + "num_tokens": 73390282.0, + "reward": 0.01953125, + "reward_std": 0.035612352192401886, + "rewards/code_format_reward/mean": 0.078125, + "rewards/code_format_reward/std": 0.2694226801395416, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.21875, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 2508.0, + "completions/mean_length": 2277.1484375, + "completions/mean_terminated_length": 1013.3943481445312, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 16.896551724137932, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5995736918071287, + "kl": 1.7578125, + "learning_rate": 3.062337647909376e-06, + "loss": 0.1374, + "num_tokens": 73812829.0, + "reward": 0.01093750074505806, + "reward_std": 0.026821641251444817, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.40625, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 3600.0, + "completions/mean_length": 2105.015625, + "completions/mean_terminated_length": 944.467529296875, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 16.96551724137931, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.857143231853844, + "kl": 5.6171875, + "learning_rate": 3.04789790545424e-06, + "loss": 0.1826, + "num_tokens": 74213343.0, + "reward": 0.01171875, + "reward_std": 0.02766144648194313, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.4375, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 2771.0, + "completions/mean_length": 1964.4296875, + "completions/mean_terminated_length": 838.923095703125, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 17.06896551724138, + "frac_reward_zero_std": 0.375, + "grad_norm": 4.017797181216476, + "kl": 6.318359375, + "learning_rate": 3.033445663799621e-06, + "loss": 0.2199, + "num_tokens": 74595862.0, + "reward": 0.02031249925494194, + "reward_std": 0.044045768678188324, + "rewards/code_format_reward/mean": 0.0859375, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.25, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 3177.0, + "completions/mean_length": 2182.3359375, + "completions/mean_terminated_length": 938.0972290039062, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 17.137931034482758, + "frac_reward_zero_std": 0.4375, + "grad_norm": 3.343975298515327, + "kl": 4.91796875, + "learning_rate": 3.018981529332633e-06, + "loss": 0.1409, + "num_tokens": 75006273.0, + "reward": 0.015625, + "reward_std": 0.0354263111948967, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.40625, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 2902.0, + "completions/mean_length": 2097.34375, + "completions/mean_terminated_length": 961.1428833007812, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 17.20689655172414, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.6967963330727057, + "kl": 2.060546875, + "learning_rate": 3.00450610893939e-06, + "loss": 0.1658, + "num_tokens": 75405805.0, + "reward": 0.01640625111758709, + "reward_std": 0.037123169749975204, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.375, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 3535.0, + "completions/mean_length": 2127.8203125, + "completions/mean_terminated_length": 835.5394897460938, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 17.275862068965516, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7450705734420142, + "kl": 2.42041015625, + "learning_rate": 2.9900200099795396e-06, + "loss": 0.1491, + "num_tokens": 75809238.0, + "reward": 0.01484375074505806, + "reward_std": 0.02863791026175022, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.46875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3596.0, + "completions/mean_length": 2129.953125, + "completions/mean_terminated_length": 1060.5570068359375, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.344827586206897, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.2848423270886091, + "kl": 1.453125, + "learning_rate": 2.9755238402607826e-06, + "loss": 0.1156, + "num_tokens": 76210872.0, + "reward": 0.01484375074505806, + "reward_std": 0.0329950749874115, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.3125, + "completions/max_length": 3848.0, + "completions/max_terminated_length": 3738.0, + "completions/mean_length": 1911.796875, + "completions/mean_terminated_length": 846.108154296875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 17.413793103448278, + "frac_reward_zero_std": 0.5, + "grad_norm": 4.043571004615006, + "kl": 4.5546875, + "learning_rate": 2.961018208013367e-06, + "loss": 0.1418, + "num_tokens": 76586654.0, + "reward": 0.015625, + "reward_std": 0.032934483140707016, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.78125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3070.0, + "completions/mean_length": 1698.1171875, + "completions/mean_terminated_length": 861.528076171875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 17.482758620689655, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.2520289626867145, + "kl": 2.818359375, + "learning_rate": 2.9465037218645694e-06, + "loss": 0.1809, + "num_tokens": 76933941.0, + "reward": 0.014843749813735485, + "reward_std": 0.03893200308084488, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.109375, + "rewards/format_reward/std": 0.31333550810813904, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.6875, + "completions/max_length": 3886.0, + "completions/max_terminated_length": 3092.0, + "completions/mean_length": 1711.015625, + "completions/mean_terminated_length": 722.279052734375, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.551724137931036, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.386808868017716, + "kl": 4.33203125, + "learning_rate": 2.9319809908131604e-06, + "loss": 0.2472, + "num_tokens": 77284023.0, + "reward": 0.02031250111758709, + "reward_std": 0.04664548113942146, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3490002751350403, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.59375, + "completions/max_length": 3933.0, + "completions/max_terminated_length": 2892.0, + "completions/mean_length": 1690.3125, + "completions/mean_terminated_length": 672.7830810546875, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 17.620689655172413, + "frac_reward_zero_std": 0.1875, + "grad_norm": 5.6917441385024565, + "kl": 4.3447265625, + "learning_rate": 2.917450624203847e-06, + "loss": 0.1689, + "num_tokens": 77631455.0, + "reward": 0.01953125, + "reward_std": 0.04777955263853073, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.1328125, + "rewards/format_reward/std": 0.3407054841518402, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.59375, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 3440.0, + "completions/mean_length": 1811.625, + "completions/mean_terminated_length": 722.795166015625, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.689655172413794, + "frac_reward_zero_std": 0.5, + "grad_norm": 5.136435933319982, + "kl": 2.98046875, + "learning_rate": 2.9029132317017118e-06, + "loss": 0.1391, + "num_tokens": 77994415.0, + "reward": 0.01328125037252903, + "reward_std": 0.03055463545024395, + "rewards/code_format_reward/mean": 0.046875, + "rewards/code_format_reward/std": 0.21220162510871887, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.6875, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 2347.0, + "completions/mean_length": 1600.0390625, + "completions/mean_terminated_length": 605.1046752929688, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 17.75862068965517, + "frac_reward_zero_std": 0.3125, + "grad_norm": 1.9956171932110272, + "kl": 6.34375, + "learning_rate": 2.888369423266629e-06, + "loss": 0.254, + "num_tokens": 78330292.0, + "reward": 0.02734375, + "reward_std": 0.047707222402095795, + "rewards/code_format_reward/mean": 0.09375, + "rewards/code_format_reward/std": 0.29262590408325195, + "rewards/format_reward/mean": 0.1796875, + "rewards/format_reward/std": 0.3854354918003082, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.0625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2239.0, + "completions/mean_length": 1378.984375, + "completions/mean_terminated_length": 578.1224365234375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 17.82758620689655, + "frac_reward_zero_std": 0.25, + "grad_norm": 5.423762267243065, + "kl": 3.732421875, + "learning_rate": 2.8738198091276712e-06, + "loss": 0.1456, + "num_tokens": 78637642.0, + "reward": 0.02578125149011612, + "reward_std": 0.0503658652305603, + "rewards/code_format_reward/mean": 0.0859375, + "rewards/code_format_reward/std": 0.2813730239868164, + "rewards/format_reward/mean": 0.171875, + "rewards/format_reward/std": 0.3787541687488556, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3603.0, + "completions/mean_length": 1465.65625, + "completions/mean_terminated_length": 757.0425415039062, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 17.896551724137932, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.5512289745895489, + "kl": 1.080078125, + "learning_rate": 2.859264999757509e-06, + "loss": 0.1129, + "num_tokens": 78954782.0, + "reward": 0.01875000074505806, + "reward_std": 0.040201522409915924, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3106.0, + "completions/mean_length": 1494.125, + "completions/mean_terminated_length": 739.6382446289062, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 17.96551724137931, + "frac_reward_zero_std": 0.4375, + "grad_norm": 4.597060014349811, + "kl": 1.115234375, + "learning_rate": 2.8447056058467928e-06, + "loss": 0.1514, + "num_tokens": 79275934.0, + "reward": 0.01328125037252903, + "reward_std": 0.028737083077430725, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.1015625, + "rewards/format_reward/std": 0.3032590448856354, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.21875, + "completions/max_length": 3973.0, + "completions/max_terminated_length": 2990.0, + "completions/mean_length": 1130.59375, + "completions/mean_terminated_length": 561.3689575195312, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 18.06896551724138, + "frac_reward_zero_std": 0.25, + "grad_norm": 5.182421375878051, + "kl": 0.8466796875, + "learning_rate": 2.830142238278531e-06, + "loss": 0.1517, + "num_tokens": 79551722.0, + "reward": 0.02734375186264515, + "reward_std": 0.04687388986349106, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.2109375, + "rewards/format_reward/std": 0.4095771610736847, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.34375, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 2633.0, + "completions/mean_length": 1148.2265625, + "completions/mean_terminated_length": 685.1307983398438, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 18.137931034482758, + "frac_reward_zero_std": 0.125, + "grad_norm": 32.76938325038699, + "kl": 2.17578125, + "learning_rate": 2.81557550810246e-06, + "loss": 0.1685, + "num_tokens": 79829767.0, + "reward": 0.03125, + "reward_std": 0.058743592351675034, + "rewards/code_format_reward/mean": 0.109375, + "rewards/code_format_reward/std": 0.31333550810813904, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40390563011169434, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.21875, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 2692.0, + "completions/mean_length": 1138.234375, + "completions/mean_terminated_length": 537.9514770507812, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.20689655172414, + "frac_reward_zero_std": 0.4375, + "grad_norm": 1.481458883574915, + "kl": 2.66650390625, + "learning_rate": 2.8010060265094026e-06, + "loss": 0.0941, + "num_tokens": 80106533.0, + "reward": 0.01953125, + "reward_std": 0.03758678585290909, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 3841.0, + "completions/max_terminated_length": 3841.0, + "completions/mean_length": 1103.625, + "completions/mean_terminated_length": 743.2232666015625, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 18.275862068965516, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.1199833226592513, + "kl": 0.8916015625, + "learning_rate": 2.786434404805629e-06, + "loss": 0.1961, + "num_tokens": 80378637.0, + "reward": 0.02031249925494194, + "reward_std": 0.04190831631422043, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.1484375, + "rewards/format_reward/std": 0.356930136680603, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.03125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2547.0, + "completions/mean_length": 1401.6484375, + "completions/mean_terminated_length": 601.9381103515625, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.344827586206897, + "frac_reward_zero_std": 0.1875, + "grad_norm": 7.691397550402843, + "kl": 4.43359375, + "learning_rate": 2.771861254387199e-06, + "loss": 0.3108, + "num_tokens": 80688680.0, + "reward": 0.02421875111758709, + "reward_std": 0.044612735509872437, + "rewards/code_format_reward/mean": 0.0546875, + "rewards/code_format_reward/std": 0.22826264798641205, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.39184603095054626, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5, + "completions/max_length": 3766.0, + "completions/max_terminated_length": 3421.0, + "completions/mean_length": 1048.765625, + "completions/mean_terminated_length": 709.5000610351562, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.413793103448278, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.0199364421525243, + "kl": 5.18212890625, + "learning_rate": 2.7572871867143204e-06, + "loss": 0.1734, + "num_tokens": 80951730.0, + "reward": 0.02421874925494194, + "reward_std": 0.05020953714847565, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.1796875, + "rewards/format_reward/std": 0.3854354918003082, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.65625, + "completions/max_length": 3779.0, + "completions/max_terminated_length": 2666.0, + "completions/mean_length": 837.921875, + "completions/mean_terminated_length": 614.5641479492188, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 18.482758620689655, + "frac_reward_zero_std": 0.125, + "grad_norm": 2.2037671170022564, + "kl": 1.66796875, + "learning_rate": 2.742712813285681e-06, + "loss": 0.1824, + "num_tokens": 81190056.0, + "reward": 0.02656250074505806, + "reward_std": 0.051087357103824615, + "rewards/code_format_reward/mean": 0.0703125, + "rewards/code_format_reward/std": 0.2566775679588318, + "rewards/format_reward/mean": 0.1953125, + "rewards/format_reward/std": 0.3979988098144531, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.6875, + "completions/max_length": 3981.0, + "completions/max_terminated_length": 3981.0, + "completions/mean_length": 611.421875, + "completions/mean_terminated_length": 415.0254211425781, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 18.551724137931036, + "frac_reward_zero_std": 0.125, + "grad_norm": 6.316499749233628, + "kl": 5.046875, + "learning_rate": 2.7281387456128017e-06, + "loss": 0.1215, + "num_tokens": 81398222.0, + "reward": 0.02734375, + "reward_std": 0.051739681512117386, + "rewards/code_format_reward/mean": 0.0625, + "rewards/code_format_reward/std": 0.24301259219646454, + "rewards/format_reward/mean": 0.2109375, + "rewards/format_reward/std": 0.4095771610736847, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.625, + "completions/max_length": 3816.0, + "completions/max_terminated_length": 3276.0, + "completions/mean_length": 763.0234375, + "completions/mean_terminated_length": 485.97412109375, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.620689655172413, + "frac_reward_zero_std": 0.25, + "grad_norm": 3.686627518077638, + "kl": 4.93212890625, + "learning_rate": 2.7135655951943716e-06, + "loss": 0.174, + "num_tokens": 81626961.0, + "reward": 0.0234375, + "reward_std": 0.03926009684801102, + "rewards/code_format_reward/mean": 0.03125, + "rewards/code_format_reward/std": 0.1746762990951538, + "rewards/format_reward/mean": 0.203125, + "rewards/format_reward/std": 0.40390563011169434, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.5625, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 2366.0, + "completions/mean_length": 721.484375, + "completions/mean_terminated_length": 390.7719421386719, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.689655172413794, + "frac_reward_zero_std": 0.0, + "grad_norm": 10.577008880883428, + "kl": 9.1611328125, + "learning_rate": 2.698993973490598e-06, + "loss": 0.257, + "num_tokens": 81850383.0, + "reward": 0.02890624850988388, + "reward_std": 0.05324123799800873, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.25, + "rewards/format_reward/std": 0.434714138507843, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2860.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 449.2578125, + "completions/mean_terminated_length": 372.7131042480469, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 18.75862068965517, + "frac_reward_zero_std": 0.25, + "grad_norm": 6.837143928180438, + "kl": 1.724609375, + "learning_rate": 2.6844244918975416e-06, + "loss": 0.1715, + "num_tokens": 82038056.0, + "reward": 0.02265625074505806, + "reward_std": 0.042552001774311066, + "rewards/code_format_reward/mean": 0.0390625, + "rewards/code_format_reward/std": 0.194504976272583, + "rewards/format_reward/mean": 0.1875, + "rewards/format_reward/std": 0.39184603095054626, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 2975.0, + "completions/max_terminated_length": 1633.0, + "completions/mean_length": 416.8125, + "completions/mean_terminated_length": 328.70489501953125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 18.82758620689655, + "frac_reward_zero_std": 0.125, + "grad_norm": 53.29809766701044, + "kl": 9.4677734375, + "learning_rate": 2.66985776172147e-06, + "loss": 0.3514, + "num_tokens": 82222480.0, + "reward": 0.01875000074505806, + "reward_std": 0.038377560675144196, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.1796875, + "rewards/format_reward/std": 0.3854354918003082, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.84375, + "completions/max_length": 3628.0, + "completions/max_terminated_length": 3545.0, + "completions/mean_length": 472.5625, + "completions/mean_terminated_length": 401.1788330078125, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 18.896551724137932, + "frac_reward_zero_std": 0.25, + "grad_norm": 2.5521363288691994, + "kl": 0.8603515625, + "learning_rate": 2.6552943941532088e-06, + "loss": 0.2253, + "num_tokens": 82412896.0, + "reward": 0.02109375037252903, + "reward_std": 0.03715597838163376, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.1953125, + "rewards/format_reward/std": 0.3979988098144531, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.8125, + "completions/max_length": 1946.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 361.4375, + "completions/mean_terminated_length": 295.4098205566406, + "completions/min_length": 5.0, + "completions/min_terminated_length": 5.0, + "epoch": 18.96551724137931, + "frac_reward_zero_std": 0.1875, + "grad_norm": 1.9363820641920646, + "kl": 1.884765625, + "learning_rate": 2.6407350002424927e-06, + "loss": 0.1635, + "num_tokens": 82590232.0, + "reward": 0.02578125149011612, + "reward_std": 0.03964492678642273, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42527204751968384, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 3701.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 302.90625, + "completions/mean_terminated_length": 252.00001525878906, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.06896551724138, + "frac_reward_zero_std": 0.0625, + "grad_norm": 2.944937731689937, + "kl": 2.4765625, + "learning_rate": 2.626180190872329e-06, + "loss": 0.339, + "num_tokens": 82760076.0, + "reward": 0.02734375, + "reward_std": 0.046176549047231674, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.2578125, + "rewards/format_reward/std": 0.43914902210235596, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.9375, + "completions/max_length": 1493.0, + "completions/max_terminated_length": 1493.0, + "completions/mean_length": 242.21875, + "completions/mean_terminated_length": 230.05557250976562, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.137931034482758, + "frac_reward_zero_std": 0.0, + "grad_norm": 15.57910661584801, + "kl": 11.6650390625, + "learning_rate": 2.611630576733372e-06, + "loss": 0.4444, + "num_tokens": 82921008.0, + "reward": 0.03046875074505806, + "reward_std": 0.051431089639663696, + "rewards/code_format_reward/mean": 0.0234375, + "rewards/code_format_reward/std": 0.15188287198543549, + "rewards/format_reward/mean": 0.28125, + "rewards/format_reward/std": 0.4513758420944214, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.875, + "completions/max_length": 2553.0, + "completions/max_terminated_length": 2553.0, + "completions/mean_length": 179.9765625, + "completions/mean_terminated_length": 150.3064422607422, + "completions/min_length": 4.0, + "completions/min_terminated_length": 4.0, + "epoch": 19.20689655172414, + "frac_reward_zero_std": 0.0, + "grad_norm": 7.323966437247427, + "kl": 12.451171875, + "learning_rate": 2.5970867682982885e-06, + "loss": 0.7394, + "num_tokens": 83074885.0, + "reward": 0.04296875, + "reward_std": 0.05002420395612717, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.4296875, + "rewards/format_reward/std": 0.4969765841960907, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 82.8984375, + "completions/mean_terminated_length": 79.51181030273438, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.275862068965516, + "frac_reward_zero_std": 0.0, + "grad_norm": 8.931857110772658, + "kl": 3.00390625, + "learning_rate": 2.582549375796154e-06, + "loss": 0.4891, + "num_tokens": 83216568.0, + "reward": 0.04843749850988388, + "reward_std": 0.05170843005180359, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.484375, + "rewards/format_reward/std": 0.5017194747924805, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.90625, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 85.921875, + "completions/mean_terminated_length": 77.81600189208984, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.344827586206897, + "frac_reward_zero_std": 0.0, + "grad_norm": 19.11515653931392, + "kl": 9.69921875, + "learning_rate": 2.568019009186841e-06, + "loss": 0.5259, + "num_tokens": 83356638.0, + "reward": 0.04140625149011612, + "reward_std": 0.05091936141252518, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.4140625, + "rewards/format_reward/std": 0.49449479579925537, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.96875, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 67.5078125, + "completions/mean_terminated_length": 66.23622131347656, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.413793103448278, + "frac_reward_zero_std": 0.0, + "grad_norm": 17.639456329459385, + "kl": 6.4296875, + "learning_rate": 2.5534962781354317e-06, + "loss": 0.4702, + "num_tokens": 83496351.0, + "reward": 0.04296875, + "reward_std": 0.049871139228343964, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.4296875, + "rewards/format_reward/std": 0.4969765841960907, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.75, + "completions/max_length": 3172.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 215.1796875, + "completions/mean_terminated_length": 137.94168090820312, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.482758620689655, + "frac_reward_zero_std": 0.125, + "grad_norm": 8.866046319285678, + "kl": 9.9765625, + "learning_rate": 2.538981791986634e-06, + "loss": 0.4059, + "num_tokens": 83654966.0, + "reward": 0.0234375, + "reward_std": 0.03877146169543266, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42527204751968384, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -2.40625, + "completions/max_length": 3662.0, + "completions/max_terminated_length": 2852.0, + "completions/mean_length": 861.859375, + "completions/mean_terminated_length": 471.21099853515625, + "completions/min_length": 2.0, + "completions/min_terminated_length": 2.0, + "epoch": 19.551724137931036, + "frac_reward_zero_std": 0.3125, + "grad_norm": 78.11921241015132, + "kl": 51.28125, + "learning_rate": 2.524476159739218e-06, + "loss": 0.5243, + "num_tokens": 83895188.0, + "reward": 0.012500000186264515, + "reward_std": 0.027382206171751022, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -1.59375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3298.0, + "completions/mean_length": 1974.359375, + "completions/mean_terminated_length": 870.5783081054688, + "completions/min_length": 7.0, + "completions/min_terminated_length": 7.0, + "epoch": 19.620689655172413, + "frac_reward_zero_std": 0.5, + "grad_norm": 147.7465652644851, + "kl": 100.5, + "learning_rate": 2.5099799900204607e-06, + "loss": 1.0181, + "num_tokens": 84277810.0, + "reward": 0.007031249813735485, + "reward_std": 0.018361147493124008, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": -0.21875, + "completions/max_length": 3954.0, + "completions/max_terminated_length": 3934.0, + "completions/mean_length": 2908.65625, + "completions/mean_terminated_length": 965.1282348632812, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 19.689655172413794, + "frac_reward_zero_std": 0.8125, + "grad_norm": 65.90059415462332, + "kl": 47.6875, + "learning_rate": 2.4954938910606108e-06, + "loss": 0.4934, + "num_tokens": 84781190.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.21875, + "completions/max_length": 3964.0, + "completions/max_terminated_length": 3425.0, + "completions/mean_length": 3164.609375, + "completions/mean_terminated_length": 1118.7999267578125, + "completions/min_length": 6.0, + "completions/min_terminated_length": 6.0, + "epoch": 19.75862068965517, + "frac_reward_zero_std": 0.9375, + "grad_norm": 44.797485068125546, + "kl": 32.0625, + "learning_rate": 2.481018470667368e-06, + "loss": 0.3168, + "num_tokens": 85317332.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.625, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 2285.0, + "completions/mean_length": 3499.0546875, + "completions/mean_terminated_length": 694.1666870117188, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 19.82758620689655, + "frac_reward_zero_std": 0.6875, + "grad_norm": 14.746233541391186, + "kl": 10.765625, + "learning_rate": 2.4665543362003802e-06, + "loss": 0.1122, + "num_tokens": 85896283.0, + "reward": 0.004687500186264515, + "reward_std": 0.01173202134668827, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 3891.953125, + "completions/mean_terminated_length": 917.2000122070312, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "epoch": 19.896551724137932, + "frac_reward_zero_std": 0.75, + "grad_norm": 14.791600919357032, + "kl": 8.171875, + "learning_rate": 2.4521020945457615e-06, + "loss": 0.079, + "num_tokens": 86525085.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.03125, + "rewards/format_reward/std": 0.1746762990951538, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 1048.0, + "completions/mean_length": 3757.9296875, + "completions/mean_terminated_length": 701.5, + "completions/min_length": 355.0, + "completions/min_terminated_length": 355.0, + "epoch": 19.96551724137931, + "frac_reward_zero_std": 0.75, + "grad_norm": 7.958211027904403, + "kl": 6.0234375, + "learning_rate": 2.4376623520906255e-06, + "loss": 0.0593, + "num_tokens": 87137172.0, + "reward": 0.00390625, + "reward_std": 0.009522313252091408, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4078.0, + "completions/max_terminated_length": 1047.0, + "completions/mean_length": 3547.34375, + "completions/mean_terminated_length": 1047.0, + "completions/min_length": 1047.0, + "completions/min_terminated_length": 1047.0, + "epoch": 20.06896551724138, + "frac_reward_zero_std": 0.6875, + "grad_norm": 5.354544835330795, + "kl": 2.328125, + "learning_rate": 2.4232357146976478e-06, + "loss": 0.0242, + "num_tokens": 87722304.0, + "reward": 0.007031249813735485, + "reward_std": 0.012863079085946083, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0703125, + "rewards/format_reward/std": 0.2566775679588318, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 3712.1796875, + "completions/mean_terminated_length": 644.6666870117188, + "completions/min_length": 454.0, + "completions/min_terminated_length": 454.0, + "epoch": 20.137931034482758, + "frac_reward_zero_std": 0.625, + "grad_norm": 11.135894730059231, + "kl": 1.595703125, + "learning_rate": 2.408822787679637e-06, + "loss": 0.0132, + "num_tokens": 88328535.0, + "reward": 0.0078125, + "reward_std": 0.01507278811186552, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2694226801395416, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 3798.5, + "completions/mean_terminated_length": 393.0, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 20.20689655172414, + "frac_reward_zero_std": 0.375, + "grad_norm": 13.397806501206668, + "kl": 0.9453125, + "learning_rate": 2.3944241757741475e-06, + "loss": 0.014, + "num_tokens": 88945815.0, + "reward": 0.014062500558793545, + "reward_std": 0.02688095159828663, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.140625, + "rewards/format_reward/std": 0.3490002751350403, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 3693.9921875, + "completions/mean_terminated_length": 788.0, + "completions/min_length": 788.0, + "completions/min_terminated_length": 788.0, + "epoch": 20.275862068965516, + "frac_reward_zero_std": 0.3125, + "grad_norm": 16.532779819726088, + "kl": 1.384765625, + "learning_rate": 2.380040483118097e-06, + "loss": 0.0148, + "num_tokens": 89549718.0, + "reward": 0.01250000111758709, + "reward_std": 0.027382206171751022, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3993.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3865.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.344827586206897, + "frac_reward_zero_std": 0.125, + "grad_norm": 31.110583034199667, + "kl": 2.390625, + "learning_rate": 2.365672313222419e-06, + "loss": 0.0239, + "num_tokens": 90191662.0, + "reward": 0.01796875149011612, + "reward_std": 0.03640326112508774, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.1796875, + "rewards/format_reward/std": 0.3854354918003082, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3378.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2581.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.413793103448278, + "frac_reward_zero_std": 0.125, + "grad_norm": 26.367316082400894, + "kl": 3.13671875, + "learning_rate": 2.351320268946749e-06, + "loss": 0.0314, + "num_tokens": 90755118.0, + "reward": 0.02265625074505806, + "reward_std": 0.03753383085131645, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.2265625, + "rewards/format_reward/std": 0.4202519655227661, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3311.0, + "completions/max_terminated_length": 90.0, + "completions/mean_length": 2773.25, + "completions/mean_terminated_length": 90.0, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 20.482758620689655, + "frac_reward_zero_std": 0.1875, + "grad_norm": 102.269291439468, + "kl": 2.6015625, + "learning_rate": 2.336984952474119e-06, + "loss": 0.0289, + "num_tokens": 91241166.0, + "reward": 0.0234375, + "reward_std": 0.035088710486888885, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42527204751968384, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3876.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3458.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3164.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.551724137931036, + "frac_reward_zero_std": 0.0, + "grad_norm": 72.51493009270023, + "kl": 2.42578125, + "learning_rate": 2.322666965285697e-06, + "loss": 0.0243, + "num_tokens": 91814894.0, + "reward": 0.02421875111758709, + "reward_std": 0.043605104088783264, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.234375, + "rewards/format_reward/std": 0.42527204751968384, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3092.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2445.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.620689655172413, + "frac_reward_zero_std": 0.25, + "grad_norm": 78.00543847542036, + "kl": 2.6484375, + "learning_rate": 2.3083669081355507e-06, + "loss": 0.0265, + "num_tokens": 92258958.0, + "reward": 0.015625, + "reward_std": 0.03164235129952431, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.15625, + "rewards/format_reward/std": 0.3645188808441162, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3562.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2766.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1714.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.689655172413794, + "frac_reward_zero_std": 0.5, + "grad_norm": 26.563040880287073, + "kl": 3.9765625, + "learning_rate": 2.2940853810254377e-06, + "loss": 0.0398, + "num_tokens": 92744110.0, + "reward": 0.00937500037252903, + "reward_std": 0.020875994116067886, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0859375, + "rewards/format_reward/std": 0.2813730239868164, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3666.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3145.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.75862068965517, + "frac_reward_zero_std": 0.5, + "grad_norm": 53.7635867052342, + "kl": 18.375, + "learning_rate": 2.2798229831796313e-06, + "loss": 0.184, + "num_tokens": 93342214.0, + "reward": 0.0078125, + "reward_std": 0.019044626504182816, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.078125, + "rewards/format_reward/std": 0.2694226801395416, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3069.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1434.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.82758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 17.9323099281784, + "kl": 7.25, + "learning_rate": 2.2655803130197816e-06, + "loss": 0.0726, + "num_tokens": 93863854.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2860.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 855.0, + "completions/min_terminated_length": 0.0, + "epoch": 20.896551724137932, + "frac_reward_zero_std": 0.875, + "grad_norm": 7.2071875811335255, + "kl": 4.671875, + "learning_rate": 2.2513579681398034e-06, + "loss": 0.0468, + "num_tokens": 94361038.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 2156.0390625, + "completions/mean_terminated_length": 695.5, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 20.96551724137931, + "frac_reward_zero_std": 0.8125, + "grad_norm": 6.4429732989107045, + "kl": 3.7734375, + "learning_rate": 2.237156545280803e-06, + "loss": 0.0392, + "num_tokens": 94766739.0, + "reward": 0.004687500186264515, + "reward_std": 0.0077601829543709755, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3844.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3617.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3213.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.06896551724138, + "frac_reward_zero_std": 0.4375, + "grad_norm": 40.14795749530242, + "kl": 2.421875, + "learning_rate": 2.2229766403060403e-06, + "loss": 0.0242, + "num_tokens": 95360883.0, + "reward": 0.00937500037252903, + "reward_std": 0.021018434315919876, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3431.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2625.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1581.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.137931034482758, + "frac_reward_zero_std": 0.6875, + "grad_norm": 21.982058945015865, + "kl": 3.59765625, + "learning_rate": 2.2088188481759305e-06, + "loss": 0.036, + "num_tokens": 95826851.0, + "reward": 0.004687500186264515, + "reward_std": 0.01173202134668827, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3675.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3390.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.20689655172414, + "frac_reward_zero_std": 0.3125, + "grad_norm": 32.96333597644167, + "kl": 3.59375, + "learning_rate": 2.194683762923073e-06, + "loss": 0.036, + "num_tokens": 96428323.0, + "reward": 0.01171875, + "reward_std": 0.026698727160692215, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.1171875, + "rewards/format_reward/std": 0.322907418012619, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3617.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3172.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.275862068965516, + "frac_reward_zero_std": 0.4375, + "grad_norm": 36.29651807443635, + "kl": 2.99609375, + "learning_rate": 2.1805719776273387e-06, + "loss": 0.03, + "num_tokens": 97022371.0, + "reward": 0.00937500037252903, + "reward_std": 0.021937813609838486, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.09375, + "rewards/format_reward/std": 0.29262590408325195, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3392.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2620.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.344827586206897, + "frac_reward_zero_std": 0.3125, + "grad_norm": 108.61061705256071, + "kl": 7.4140625, + "learning_rate": 2.166484084390974e-06, + "loss": 0.0743, + "num_tokens": 97586515.0, + "reward": 0.01249999925494194, + "reward_std": 0.027382206171751022, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.125, + "rewards/format_reward/std": 0.3320184051990509, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3954.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 3304.625, + "completions/mean_terminated_length": 142.0, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 21.413793103448278, + "frac_reward_zero_std": 0.6875, + "grad_norm": 77.59864154798734, + "kl": 20.03125, + "learning_rate": 2.1524206743137636e-06, + "loss": 0.1997, + "num_tokens": 98140579.0, + "reward": 0.004687500186264515, + "reward_std": 0.011732022278010845, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.046875, + "rewards/format_reward/std": 0.21220162510871887, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 4029.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3961.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.482758620689655, + "frac_reward_zero_std": 0.6875, + "grad_norm": 49.341691513169614, + "kl": 7.4609375, + "learning_rate": 2.1383823374682287e-06, + "loss": 0.0746, + "num_tokens": 98785411.0, + "reward": 0.00390625, + "reward_std": 0.011048542335629463, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3212.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2480.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1502.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 9.020518030640764, + "kl": 4.37890625, + "learning_rate": 2.124369662874868e-06, + "loss": 0.0438, + "num_tokens": 99232859.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2278.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 683.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 6.851405408286888, + "kl": 3.51953125, + "learning_rate": 2.110383238477441e-06, + "loss": 0.0352, + "num_tokens": 99655379.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1785.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 710.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.5568597923301497, + "kl": 3.7109375, + "learning_rate": 2.096423651118305e-06, + "loss": 0.0371, + "num_tokens": 100014995.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3675.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2987.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2231.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.4105407866695843, + "kl": 2.31640625, + "learning_rate": 2.082491486513788e-06, + "loss": 0.0232, + "num_tokens": 100528499.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 2473.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 1712.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 791.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.6596995368025504, + "kl": 2.14453125, + "learning_rate": 2.0685873292296116e-06, + "loss": 0.0214, + "num_tokens": 100878739.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3310.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 2706.03125, + "completions/mean_terminated_length": 70.0, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 21.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.7381919799536105, + "kl": 1.734375, + "learning_rate": 2.054711762656369e-06, + "loss": 0.0173, + "num_tokens": 101356183.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3623.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3105.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1846.0, + "completions/min_terminated_length": 0.0, + "epoch": 21.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3083439279190165, + "kl": 1.814453125, + "learning_rate": 2.040865368985044e-06, + "loss": 0.0181, + "num_tokens": 101884287.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3723.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3478.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.43559275741805176, + "kl": 1.3828125, + "learning_rate": 2.027048729182583e-06, + "loss": 0.0138, + "num_tokens": 102491967.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 141.0, + "completions/mean_length": 3766.6015625, + "completions/mean_terminated_length": 141.0, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 22.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.4659577239042336, + "kl": 1.4140625, + "learning_rate": 2.0132624229675205e-06, + "loss": 0.0141, + "num_tokens": 103103396.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3889.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3629.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3302.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.088985103033209, + "kl": 1.3203125, + "learning_rate": 1.9995070287856546e-06, + "loss": 0.0132, + "num_tokens": 103697844.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2738.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1453.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3899575907826617, + "kl": 1.3359375, + "learning_rate": 1.985783123785774e-06, + "loss": 0.0134, + "num_tokens": 104179444.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3551.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2919.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9200105360180978, + "kl": 1.458984375, + "learning_rate": 1.9720912837954486e-06, + "loss": 0.0145, + "num_tokens": 104764172.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3143.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2597.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4396045362414407, + "kl": 1.22265625, + "learning_rate": 1.958432083296862e-06, + "loss": 0.0122, + "num_tokens": 105296444.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3844.0, + "completions/max_terminated_length": 1613.0, + "completions/mean_length": 3476.734375, + "completions/mean_terminated_length": 1613.0, + "completions/min_length": 1613.0, + "completions/min_terminated_length": 1613.0, + "epoch": 22.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.31437757247901466, + "kl": 1.14453125, + "learning_rate": 1.9448060954027093e-06, + "loss": 0.0114, + "num_tokens": 105872538.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3438.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3190.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3801019010665978, + "kl": 1.21875, + "learning_rate": 1.931213891832153e-06, + "loss": 0.0122, + "num_tokens": 106443674.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3405.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2850.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.41862411846040465, + "kl": 1.1328125, + "learning_rate": 1.9176560428868336e-06, + "loss": 0.0113, + "num_tokens": 107010618.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3961.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3497.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3145.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15447039288221498, + "kl": 1.00390625, + "learning_rate": 1.9041331174269373e-06, + "loss": 0.01, + "num_tokens": 107589402.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3716.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3190.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44569839064244576, + "kl": 1.1328125, + "learning_rate": 1.8906456828473341e-06, + "loss": 0.0113, + "num_tokens": 108196122.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3481.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3044.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1877332615017673, + "kl": 1.138671875, + "learning_rate": 1.8771943050537656e-06, + "loss": 0.0114, + "num_tokens": 108771682.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3751.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3572.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3283.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.1368502872613123, + "kl": 0.984375, + "learning_rate": 1.8637795484391046e-06, + "loss": 0.0098, + "num_tokens": 109360034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3816.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3444.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3131.0, + "completions/min_terminated_length": 0.0, + "epoch": 22.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18495732660292058, + "kl": 0.953125, + "learning_rate": 1.8504019758596698e-06, + "loss": 0.0095, + "num_tokens": 109932034.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4084.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3776.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3607.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2997038986166403, + "kl": 0.958984375, + "learning_rate": 1.8370621486116163e-06, + "loss": 0.0096, + "num_tokens": 110546530.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3697.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3431.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4687330139312712, + "kl": 1.041015625, + "learning_rate": 1.823760626407377e-06, + "loss": 0.0104, + "num_tokens": 111149746.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3889.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3518.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3123.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.20689655172414, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.2686906307875616, + "kl": 0.916015625, + "learning_rate": 1.8104979673521838e-06, + "loss": 0.0092, + "num_tokens": 111731122.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3933.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3698.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3483.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4459883207937827, + "kl": 0.9375, + "learning_rate": 1.7972747279206482e-06, + "loss": 0.0094, + "num_tokens": 112335602.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3656.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 3125.5, + "completions/mean_terminated_length": 1096.0, + "completions/min_length": 1096.0, + "completions/min_terminated_length": 1096.0, + "epoch": 23.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.47023729166382977, + "kl": 1.03125, + "learning_rate": 1.7840914629334122e-06, + "loss": 0.0103, + "num_tokens": 112866298.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3708.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3310.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.44946285877291525, + "kl": 1.0625, + "learning_rate": 1.7709487255338731e-06, + "loss": 0.0106, + "num_tokens": 113470826.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3786.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3092.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.482758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.434112729967371, + "kl": 0.98046875, + "learning_rate": 1.7578470671649684e-06, + "loss": 0.0098, + "num_tokens": 114084330.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 807.0, + "completions/mean_length": 3595.0625, + "completions/mean_terminated_length": 807.0, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "epoch": 23.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24483285658126575, + "kl": 1.0546875, + "learning_rate": 1.744787037546045e-06, + "loss": 0.0105, + "num_tokens": 114675570.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3662.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3582.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3431.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.620689655172413, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.836642163015548, + "kl": 1.03515625, + "learning_rate": 1.731769184649788e-06, + "loss": 0.0104, + "num_tokens": 115265138.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1480.0, + "completions/mean_length": 3415.7265625, + "completions/mean_terminated_length": 1480.0, + "completions/min_length": 1480.0, + "completions/min_terminated_length": 1480.0, + "epoch": 23.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.9387186601541986, + "kl": 1.12109375, + "learning_rate": 1.7187940546792325e-06, + "loss": 0.0112, + "num_tokens": 115833191.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3782.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3565.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.75862068965517, + "frac_reward_zero_std": 0.875, + "grad_norm": 9.46227286683567, + "kl": 1.1640625, + "learning_rate": 1.7058621920448465e-06, + "loss": 0.0117, + "num_tokens": 116448423.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3779.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3566.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3219.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.46798818186427327, + "kl": 1.232421875, + "learning_rate": 1.6929741393416855e-06, + "loss": 0.0123, + "num_tokens": 117035039.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3855.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3253.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2673.0, + "completions/min_terminated_length": 0.0, + "epoch": 23.896551724137932, + "frac_reward_zero_std": 0.875, + "grad_norm": 8.866925669400183, + "kl": 1.455078125, + "learning_rate": 1.6801304373266286e-06, + "loss": 0.0145, + "num_tokens": 117582495.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 3627.78125, + "completions/mean_terminated_length": 433.0, + "completions/min_length": 433.0, + "completions/min_terminated_length": 433.0, + "epoch": 23.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4121912299594654, + "kl": 1.134765625, + "learning_rate": 1.667331624895689e-06, + "loss": 0.0113, + "num_tokens": 118177923.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 2956.609375, + "completions/mean_terminated_length": 326.0, + "completions/min_length": 210.0, + "completions/min_terminated_length": 210.0, + "epoch": 24.06896551724138, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.839985387205673, + "kl": 1.29296875, + "learning_rate": 1.6545782390614037e-06, + "loss": 0.013, + "num_tokens": 118686273.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3457.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 2946.4453125, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 24.137931034482758, + "frac_reward_zero_std": 0.9375, + "grad_norm": 13.455888197603404, + "kl": 2.7734375, + "learning_rate": 1.6418708149302992e-06, + "loss": 0.0251, + "num_tokens": 119194490.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3954.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3542.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3078.0, + "completions/min_terminated_length": 0.0, + "epoch": 24.20689655172414, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.0314910559580888, + "kl": 1.607421875, + "learning_rate": 1.6292098856804423e-06, + "loss": 0.0161, + "num_tokens": 119778970.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3805.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3634.0, + "completions/min_terminated_length": 0.0, + "epoch": 24.275862068965516, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.1375967851187987, + "kl": 1.853515625, + "learning_rate": 1.6165959825390661e-06, + "loss": 0.0185, + "num_tokens": 120397146.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 746.0, + "completions/mean_length": 3621.578125, + "completions/mean_terminated_length": 746.0, + "completions/min_length": 746.0, + "completions/min_terminated_length": 746.0, + "epoch": 24.344827586206897, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.9099338039446698, + "kl": 1.8203125, + "learning_rate": 1.604029634760284e-06, + "loss": 0.0182, + "num_tokens": 120991548.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3737.7890625, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 24.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.8118569722283366, + "kl": 1.337890625, + "learning_rate": 1.59151136960288e-06, + "loss": 0.0134, + "num_tokens": 121600153.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3589.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3092.0, + "completions/min_terminated_length": 0.0, + "epoch": 24.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7367641169948803, + "kl": 1.10546875, + "learning_rate": 1.5790417123081903e-06, + "loss": 0.0111, + "num_tokens": 122190713.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3676.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3458.0, + "completions/min_terminated_length": 0.0, + "epoch": 24.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.046414781901739, + "kl": 1.19921875, + "learning_rate": 1.5666211860780583e-06, + "loss": 0.012, + "num_tokens": 122792409.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3995.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3826.0, + "completions/min_terminated_length": 0.0, + "epoch": 24.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0266921966715032, + "kl": 1.201171875, + "learning_rate": 1.5542503120528918e-06, + "loss": 0.012, + "num_tokens": 123433305.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3639.0, + "completions/max_terminated_length": 1978.0, + "completions/mean_length": 2935.0078125, + "completions/mean_terminated_length": 1978.0, + "completions/min_length": 1691.0, + "completions/min_terminated_length": 1978.0, + "epoch": 24.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4641877307231566, + "kl": 1.13671875, + "learning_rate": 1.5419296092897866e-06, + "loss": 0.0114, + "num_tokens": 123940058.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 3732.40625, + "completions/mean_terminated_length": 475.0, + "completions/min_length": 475.0, + "completions/min_terminated_length": 475.0, + "epoch": 24.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3849148754140799, + "kl": 1.318359375, + "learning_rate": 1.529659594740755e-06, + "loss": 0.0132, + "num_tokens": 124548878.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 3602.09375, + "completions/mean_terminated_length": 474.5, + "completions/min_length": 456.0, + "completions/min_terminated_length": 456.0, + "epoch": 24.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7183176763768305, + "kl": 0.982421875, + "learning_rate": 1.5174407832310338e-06, + "loss": 0.0098, + "num_tokens": 125138706.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3852.0, + "completions/max_terminated_length": 184.0, + "completions/mean_length": 3626.9453125, + "completions/mean_terminated_length": 184.0, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 24.896551724137932, + "frac_reward_zero_std": 0.9375, + "grad_norm": 76.28656625387477, + "kl": 2.8603515625, + "learning_rate": 1.5052736874374815e-06, + "loss": 0.0286, + "num_tokens": 125734027.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3629.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3053.0, + "completions/min_terminated_length": 0.0, + "epoch": 24.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3820884009065382, + "kl": 0.9228515625, + "learning_rate": 1.4931588178670695e-06, + "loss": 0.0092, + "num_tokens": 126329675.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3876.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 3551.9140625, + "completions/mean_terminated_length": 406.0, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 25.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2964534891394953, + "kl": 0.9296875, + "learning_rate": 1.4810966828354605e-06, + "loss": 0.0093, + "num_tokens": 126915392.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3933.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3479.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2988.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.32995971567480264, + "kl": 0.98828125, + "learning_rate": 1.469087788445684e-06, + "loss": 0.0099, + "num_tokens": 127491776.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3658.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3212.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3010622006281474, + "kl": 0.814453125, + "learning_rate": 1.4571326385668965e-06, + "loss": 0.0081, + "num_tokens": 128091072.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3543.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2844.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21401397666059846, + "kl": 0.90625, + "learning_rate": 1.4452317348132434e-06, + "loss": 0.0091, + "num_tokens": 128675712.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3953.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3656.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21870819130395527, + "kl": 0.8876953125, + "learning_rate": 1.4333855765228104e-06, + "loss": 0.0089, + "num_tokens": 129310592.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3889.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3777.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3701.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.413793103448278, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.9456913420889015, + "kl": 0.796875, + "learning_rate": 1.421594660736675e-06, + "loss": 0.008, + "num_tokens": 129925152.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 178.0, + "completions/mean_length": 3815.8125, + "completions/mean_terminated_length": 133.0, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 25.482758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 4.2622280654233435, + "kl": 0.91796875, + "learning_rate": 1.4098594821780476e-06, + "loss": 0.0069, + "num_tokens": 130543480.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 3614.1484375, + "completions/mean_terminated_length": 383.0, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 25.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20859600720715726, + "kl": 0.8916015625, + "learning_rate": 1.3981805332315174e-06, + "loss": 0.0089, + "num_tokens": 131136723.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 57.0, + "completions/mean_length": 3695.96875, + "completions/mean_terminated_length": 57.0, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 25.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3710955385744262, + "kl": 0.9169921875, + "learning_rate": 1.3865583039223929e-06, + "loss": 0.0092, + "num_tokens": 131739975.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 17.0, + "completions/mean_length": 3649.3359375, + "completions/mean_terminated_length": 17.0, + "completions/min_length": 17.0, + "completions/min_terminated_length": 17.0, + "epoch": 25.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3673494528452197, + "kl": 0.8984375, + "learning_rate": 1.374993281896137e-06, + "loss": 0.009, + "num_tokens": 132337930.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3954.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 3649.5078125, + "completions/mean_terminated_length": 759.0, + "completions/min_length": 759.0, + "completions/min_terminated_length": 759.0, + "epoch": 25.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3644650808148341, + "kl": 0.943359375, + "learning_rate": 1.3634859523979134e-06, + "loss": 0.0094, + "num_tokens": 132936139.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 3507.390625, + "completions/mean_terminated_length": 298.66668701171875, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 25.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2894840110413353, + "kl": 0.931640625, + "learning_rate": 1.3520367982522208e-06, + "loss": 0.0093, + "num_tokens": 133516157.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3878.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3626.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.896551724137932, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.3360352116952874, + "kl": 0.966796875, + "learning_rate": 1.3406462998426358e-06, + "loss": 0.0097, + "num_tokens": 134142445.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3532.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3044.0, + "completions/min_terminated_length": 0.0, + "epoch": 25.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.870537760228744, + "kl": 1.0146484375, + "learning_rate": 1.3293149350916595e-06, + "loss": 0.0101, + "num_tokens": 134725677.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3691.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3347.8203125, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 26.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36940153213331306, + "kl": 0.8701171875, + "learning_rate": 1.3180431794406623e-06, + "loss": 0.0087, + "num_tokens": 135285270.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 203.0, + "completions/mean_length": 3864.796875, + "completions/mean_terminated_length": 203.0, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 26.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4201543122757977, + "kl": 0.97265625, + "learning_rate": 1.3068315058299358e-06, + "loss": 0.0097, + "num_tokens": 135911036.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 3575.2890625, + "completions/mean_terminated_length": 143.0, + "completions/min_length": 22.0, + "completions/min_terminated_length": 22.0, + "epoch": 26.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3986036615007821, + "kl": 1.0341796875, + "learning_rate": 1.2956803846788503e-06, + "loss": 0.0103, + "num_tokens": 136499745.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3591.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3419.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3190.0, + "completions/min_terminated_length": 0.0, + "epoch": 26.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6253148469350631, + "kl": 1.0625, + "learning_rate": 1.284590283866116e-06, + "loss": 0.0106, + "num_tokens": 137068449.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3729.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3511.0, + "completions/min_terminated_length": 0.0, + "epoch": 26.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4167536859641112, + "kl": 0.9130859375, + "learning_rate": 1.2735616687101518e-06, + "loss": 0.0091, + "num_tokens": 137676697.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 81.0, + "completions/mean_length": 3448.6796875, + "completions/mean_terminated_length": 81.0, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 26.413793103448278, + "frac_reward_zero_std": 0.9375, + "grad_norm": 8.432101521755621, + "kl": 0.8310546875, + "learning_rate": 1.2625950019495614e-06, + "loss": 0.0083, + "num_tokens": 138248296.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3964.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 3306.96875, + "completions/mean_terminated_length": 316.0, + "completions/min_length": 316.0, + "completions/min_terminated_length": 316.0, + "epoch": 26.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.279100034883122, + "kl": 0.912109375, + "learning_rate": 1.251690743723718e-06, + "loss": 0.0091, + "num_tokens": 138802660.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 3500.984375, + "completions/mean_terminated_length": 191.5, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 26.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48801033618279654, + "kl": 0.8671875, + "learning_rate": 1.2408493515534581e-06, + "loss": 0.0087, + "num_tokens": 139380690.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 92.0, + "completions/mean_length": 3878.71875, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 26.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4693294450937292, + "kl": 0.9873046875, + "learning_rate": 1.2300712803218834e-06, + "loss": 0.0099, + "num_tokens": 140008238.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 3580.1796875, + "completions/mean_terminated_length": 273.66668701171875, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 26.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3674590278310469, + "kl": 0.79296875, + "learning_rate": 1.2193569822552772e-06, + "loss": 0.0079, + "num_tokens": 140597133.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3468.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3090.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2554.0, + "completions/min_terminated_length": 0.0, + "epoch": 26.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3323923370329007, + "kl": 0.7861328125, + "learning_rate": 1.2087069069041268e-06, + "loss": 0.0079, + "num_tokens": 141123757.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3712.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3443.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2860.0, + "completions/min_terminated_length": 0.0, + "epoch": 26.82758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.5532572186614733, + "kl": 0.875, + "learning_rate": 1.1981215011242654e-06, + "loss": 0.0088, + "num_tokens": 141695629.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3749.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3369.0, + "completions/min_terminated_length": 0.0, + "epoch": 26.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6855015361222366, + "kl": 0.9169921875, + "learning_rate": 1.1876012090581184e-06, + "loss": 0.0092, + "num_tokens": 142304309.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 3855.0, + "completions/max_terminated_length": 70.0, + "completions/mean_length": 3353.9921875, + "completions/mean_terminated_length": 28.666667938232422, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 26.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7452537798528475, + "kl": 0.845703125, + "learning_rate": 1.177146472116071e-06, + "loss": 0.0085, + "num_tokens": 142863548.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3712.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3230.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2655.0, + "completions/min_terminated_length": 0.0, + "epoch": 27.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15321991613545333, + "kl": 0.88671875, + "learning_rate": 1.1667577289579462e-06, + "loss": 0.0089, + "num_tokens": 143408092.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 16.0, + "completions/mean_length": 3606.203125, + "completions/mean_terminated_length": 16.0, + "completions/min_length": 16.0, + "completions/min_terminated_length": 16.0, + "epoch": 27.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3216790824586849, + "kl": 0.787109375, + "learning_rate": 1.1564354154746007e-06, + "loss": 0.0079, + "num_tokens": 143999854.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 3568.9609375, + "completions/mean_terminated_length": 584.0, + "completions/min_length": 584.0, + "completions/min_terminated_length": 584.0, + "epoch": 27.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2293355418323331, + "kl": 0.896484375, + "learning_rate": 1.146179964769635e-06, + "loss": 0.009, + "num_tokens": 144587753.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3702.0, + "completions/max_terminated_length": 521.0, + "completions/mean_length": 3501.671875, + "completions/mean_terminated_length": 521.0, + "completions/min_length": 521.0, + "completions/min_terminated_length": 521.0, + "epoch": 27.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24073059200385705, + "kl": 0.8017578125, + "learning_rate": 1.1359918071412195e-06, + "loss": 0.008, + "num_tokens": 145167039.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3848.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3510.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2987.0, + "completions/min_terminated_length": 0.0, + "epoch": 27.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3178637455204856, + "kl": 0.86328125, + "learning_rate": 1.1258713700640456e-06, + "loss": 0.0087, + "num_tokens": 145746951.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 3476.796875, + "completions/mean_terminated_length": 547.0, + "completions/min_length": 547.0, + "completions/min_terminated_length": 547.0, + "epoch": 27.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12214403358860393, + "kl": 0.912109375, + "learning_rate": 1.115819078171383e-06, + "loss": 0.0091, + "num_tokens": 146323053.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 3682.125, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 27.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.12897945574009914, + "kl": 0.7890625, + "learning_rate": 1.1058353532372667e-06, + "loss": 0.0079, + "num_tokens": 146925437.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3627.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3040.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2673.0, + "completions/min_terminated_length": 0.0, + "epoch": 27.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16601461816076052, + "kl": 0.8662109375, + "learning_rate": 1.0959206141587998e-06, + "loss": 0.0087, + "num_tokens": 147444461.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3829.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 2987.171875, + "completions/mean_terminated_length": 114.0, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 27.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6719414486342502, + "kl": 0.9482421875, + "learning_rate": 1.0860752769385766e-06, + "loss": 0.0095, + "num_tokens": 147957891.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3973.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 3645.078125, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 27.689655172413794, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.6787758046147854, + "kl": 0.7978515625, + "learning_rate": 1.0762997546672279e-06, + "loss": 0.008, + "num_tokens": 148554365.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3876.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3511.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3180.0, + "completions/min_terminated_length": 0.0, + "epoch": 27.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8095492451199964, + "kl": 0.876953125, + "learning_rate": 1.0665944575060914e-06, + "loss": 0.0088, + "num_tokens": 149134941.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 9.0, + "completions/mean_length": 3694.421875, + "completions/mean_terminated_length": 9.0, + "completions/min_length": 9.0, + "completions/min_terminated_length": 9.0, + "epoch": 27.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0715340671172988, + "kl": 0.9248046875, + "learning_rate": 1.056959792669997e-06, + "loss": 0.0093, + "num_tokens": 149737755.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3441.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3087.0, + "completions/min_terminated_length": 0.0, + "epoch": 27.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4141909984905106, + "kl": 0.892578125, + "learning_rate": 1.0473961644101856e-06, + "loss": 0.0089, + "num_tokens": 150308179.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3622.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3147.0, + "completions/min_terminated_length": 0.0, + "epoch": 27.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2255309268311694, + "kl": 0.7841796875, + "learning_rate": 1.037903973997345e-06, + "loss": 0.0078, + "num_tokens": 150902731.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3541.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2683.0, + "completions/min_terminated_length": 0.0, + "epoch": 28.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3262628560823647, + "kl": 0.7548828125, + "learning_rate": 1.0284836197047737e-06, + "loss": 0.0076, + "num_tokens": 151485939.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3719.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3148.0, + "completions/min_terminated_length": 0.0, + "epoch": 28.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3785603547942102, + "kl": 0.8994140625, + "learning_rate": 1.0191354967916712e-06, + "loss": 0.009, + "num_tokens": 152092171.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1065.0, + "completions/mean_length": 3582.390625, + "completions/mean_terminated_length": 1065.0, + "completions/min_length": 1065.0, + "completions/min_terminated_length": 1065.0, + "epoch": 28.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6413165465435274, + "kl": 0.9697265625, + "learning_rate": 1.0098599974865515e-06, + "loss": 0.0097, + "num_tokens": 152680621.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3973.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3708.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3246.0, + "completions/min_terminated_length": 0.0, + "epoch": 28.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.38974890306156273, + "kl": 0.7998046875, + "learning_rate": 1.0006575109707898e-06, + "loss": 0.008, + "num_tokens": 153285253.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3889.0, + "completions/max_terminated_length": 60.0, + "completions/mean_length": 3463.140625, + "completions/mean_terminated_length": 34.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 28.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2765363629705189, + "kl": 0.7646484375, + "learning_rate": 9.915284233622877e-07, + "loss": 0.0076, + "num_tokens": 153858439.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3964.0, + "completions/max_terminated_length": 180.0, + "completions/mean_length": 3335.6875, + "completions/mean_terminated_length": 180.0, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 28.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.14277807820182337, + "kl": 0.7998046875, + "learning_rate": 9.824731176992796e-07, + "loss": 0.008, + "num_tokens": 154416479.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 3363.4921875, + "completions/mean_terminated_length": 635.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 28.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.529958856068822, + "kl": 0.9189453125, + "learning_rate": 9.734919739242543e-07, + "loss": 0.0092, + "num_tokens": 154977846.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3562.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3284.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2629.0, + "completions/min_terminated_length": 0.0, + "epoch": 28.551724137931036, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.695894262108347, + "kl": 0.822265625, + "learning_rate": 9.645853688680177e-07, + "loss": 0.0082, + "num_tokens": 155529302.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3855.0, + "completions/max_terminated_length": 3117.0, + "completions/mean_length": 3398.984375, + "completions/mean_terminated_length": 3117.0, + "completions/min_length": 2988.0, + "completions/min_terminated_length": 3117.0, + "epoch": 28.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.9989409896774373, + "kl": 1.080078125, + "learning_rate": 9.557536762338786e-07, + "loss": 0.0108, + "num_tokens": 156095444.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2942.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1817.0, + "completions/min_terminated_length": 0.0, + "epoch": 28.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2000430461736387, + "kl": 1.0009765625, + "learning_rate": 9.46997266581973e-07, + "loss": 0.01, + "num_tokens": 156603124.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3755.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3439.0, + "completions/min_terminated_length": 0.0, + "epoch": 28.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5851798965154009, + "kl": 0.7685546875, + "learning_rate": 9.383165073137115e-07, + "loss": 0.0077, + "num_tokens": 157214836.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 3886.0, + "completions/max_terminated_length": 793.0, + "completions/mean_length": 3041.4375, + "completions/mean_terminated_length": 299.3333435058594, + "completions/min_length": 18.0, + "completions/min_terminated_length": 18.0, + "epoch": 28.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.28855791960820826, + "kl": 0.796875, + "learning_rate": 9.297117626563687e-07, + "loss": 0.008, + "num_tokens": 157735212.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.84375, + "completions/max_length": 3989.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 3385.515625, + "completions/mean_terminated_length": 170.40000915527344, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 28.896551724137932, + "frac_reward_zero_std": 0.9375, + "grad_norm": 4.115015847542139, + "kl": 0.818359375, + "learning_rate": 9.211833936477957e-07, + "loss": 0.0082, + "num_tokens": 158299630.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 680.0, + "completions/mean_length": 3138.6328125, + "completions/mean_terminated_length": 680.0, + "completions/min_length": 680.0, + "completions/min_terminated_length": 680.0, + "epoch": 28.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36112828901574734, + "kl": 0.8056640625, + "learning_rate": 9.127317581212753e-07, + "loss": 0.0081, + "num_tokens": 158832007.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 3644.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 2845.8359375, + "completions/mean_terminated_length": 131.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 29.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2460400784894155, + "kl": 0.8798828125, + "learning_rate": 9.043572106905084e-07, + "loss": 0.0088, + "num_tokens": 159326906.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3751.0, + "completions/max_terminated_length": 1032.0, + "completions/mean_length": 2984.140625, + "completions/mean_terminated_length": 1032.0, + "completions/min_length": 1032.0, + "completions/min_terminated_length": 1032.0, + "epoch": 29.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.19754937012199905, + "kl": 0.814453125, + "learning_rate": 8.960601027347321e-07, + "loss": 0.0082, + "num_tokens": 159839948.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3691.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3525.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3360.0, + "completions/min_terminated_length": 0.0, + "epoch": 29.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.13239906069104662, + "kl": 0.677734375, + "learning_rate": 8.878407823839788e-07, + "loss": 0.0068, + "num_tokens": 160422284.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3713.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3478.0, + "completions/min_terminated_length": 0.0, + "epoch": 29.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.36718771412162815, + "kl": 0.796875, + "learning_rate": 8.796995945044689e-07, + "loss": 0.008, + "num_tokens": 161026644.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3781.0, + "completions/max_terminated_length": 2483.0, + "completions/mean_length": 3147.2578125, + "completions/mean_terminated_length": 1818.5, + "completions/min_length": 1154.0, + "completions/min_terminated_length": 1154.0, + "epoch": 29.344827586206897, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.169416419631883, + "kl": 0.908203125, + "learning_rate": 8.716368806841405e-07, + "loss": 0.0091, + "num_tokens": 161560565.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3779.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3533.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3309.0, + "completions/min_terminated_length": 0.0, + "epoch": 29.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.3795953881794978, + "kl": 0.9765625, + "learning_rate": 8.636529792183171e-07, + "loss": 0.0098, + "num_tokens": 162143925.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3961.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 3567.1953125, + "completions/mean_terminated_length": 302.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 29.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0665685823452116, + "kl": 0.8798828125, + "learning_rate": 8.557482250955144e-07, + "loss": 0.0088, + "num_tokens": 162731598.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 99.0, + "completions/mean_length": 3249.34375, + "completions/mean_terminated_length": 99.0, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 29.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3814851298081842, + "kl": 0.8134765625, + "learning_rate": 8.479229499833844e-07, + "loss": 0.0081, + "num_tokens": 163278586.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4078.0, + "completions/max_terminated_length": 1831.0, + "completions/mean_length": 3718.6328125, + "completions/mean_terminated_length": 1340.5, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "epoch": 29.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15176365945131062, + "kl": 0.7451171875, + "learning_rate": 8.401774822147976e-07, + "loss": 0.0074, + "num_tokens": 163885643.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3175.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2948.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2718.0, + "completions/min_terminated_length": 0.0, + "epoch": 29.689655172413794, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.058306765611725, + "kl": 0.8154296875, + "learning_rate": 8.325121467740695e-07, + "loss": 0.0082, + "num_tokens": 164394059.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3462.890625, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 29.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.258682687819089, + "kl": 0.7685546875, + "learning_rate": 8.249272652833226e-07, + "loss": 0.0077, + "num_tokens": 164966981.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3675.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 3020.4453125, + "completions/mean_terminated_length": 111.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 29.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3645728533747666, + "kl": 0.7998046875, + "learning_rate": 8.174231559889931e-07, + "loss": 0.008, + "num_tokens": 165484670.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 137.0, + "completions/mean_length": 3034.109375, + "completions/mean_terminated_length": 96.0, + "completions/min_length": 55.0, + "completions/min_terminated_length": 55.0, + "epoch": 29.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2787119228400319, + "kl": 0.8466796875, + "learning_rate": 8.100001337484787e-07, + "loss": 0.0085, + "num_tokens": 166004108.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3197.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2383.0, + "completions/min_terminated_length": 0.0, + "epoch": 29.96551724137931, + "frac_reward_zero_std": 0.875, + "grad_norm": 6.186121979015489, + "kl": 0.9208984375, + "learning_rate": 8.026585100169251e-07, + "loss": 0.0092, + "num_tokens": 166543332.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3274.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2390.0, + "completions/min_terminated_length": 0.0, + "epoch": 30.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7446233999958237, + "kl": 0.923828125, + "learning_rate": 7.953985928341601e-07, + "loss": 0.0092, + "num_tokens": 167093036.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3701.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3090.1484375, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8530934216927875, + "kl": 0.8564453125, + "learning_rate": 7.882206868117693e-07, + "loss": 0.0086, + "num_tokens": 167619647.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 206.0, + "completions/mean_length": 3716.3203125, + "completions/mean_terminated_length": 79.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.0550851964159542, + "kl": 0.939453125, + "learning_rate": 7.81125093120313e-07, + "loss": 0.0094, + "num_tokens": 168226408.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3241.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 3151.453125, + "completions/mean_terminated_length": 406.0, + "completions/min_length": 406.0, + "completions/min_terminated_length": 406.0, + "epoch": 30.275862068965516, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.987086291107972, + "kl": 0.767578125, + "learning_rate": 7.741121094766916e-07, + "loss": 0.0077, + "num_tokens": 168759962.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3615.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3196.8203125, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20934456348471733, + "kl": 0.673828125, + "learning_rate": 7.671820301316532e-07, + "loss": 0.0067, + "num_tokens": 169299059.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3954.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 3340.90625, + "completions/mean_terminated_length": 193.0, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 30.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.26690197328437076, + "kl": 0.869140625, + "learning_rate": 7.603351458574474e-07, + "loss": 0.0087, + "num_tokens": 169857767.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3358.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2638.0, + "completions/min_terminated_length": 0.0, + "epoch": 30.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.15328210113309632, + "kl": 0.771484375, + "learning_rate": 7.535717439356255e-07, + "loss": 0.0077, + "num_tokens": 170416399.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3591.0, + "completions/max_terminated_length": 217.0, + "completions/mean_length": 2907.390625, + "completions/mean_terminated_length": 217.0, + "completions/min_length": 217.0, + "completions/min_terminated_length": 217.0, + "epoch": 30.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.21811897481608625, + "kl": 0.7958984375, + "learning_rate": 7.46892108144986e-07, + "loss": 0.008, + "num_tokens": 170919617.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 62.0, + "completions/mean_length": 3019.265625, + "completions/mean_terminated_length": 33.333335876464844, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 30.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.333253834493737, + "kl": 0.8134765625, + "learning_rate": 7.402965187496697e-07, + "loss": 0.0081, + "num_tokens": 171437155.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3855.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3307.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2262.0, + "completions/min_terminated_length": 0.0, + "epoch": 30.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.223342945786469, + "kl": 0.7802734375, + "learning_rate": 7.337852524873974e-07, + "loss": 0.0078, + "num_tokens": 171991555.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3631.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3297.0, + "completions/min_terminated_length": 0.0, + "epoch": 30.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.29323377756635877, + "kl": 0.9619140625, + "learning_rate": 7.273585825578608e-07, + "loss": 0.0096, + "num_tokens": 172586251.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3222.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2151.0, + "completions/min_terminated_length": 0.0, + "epoch": 30.82758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.35038252371357637, + "kl": 0.8544921875, + "learning_rate": 7.21016778611259e-07, + "loss": 0.0085, + "num_tokens": 173129571.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 1743.0, + "completions/mean_length": 3255.6328125, + "completions/mean_terminated_length": 1743.0, + "completions/min_length": 1743.0, + "completions/min_terminated_length": 1743.0, + "epoch": 30.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18108887766051743, + "kl": 0.69921875, + "learning_rate": 7.147601067369835e-07, + "loss": 0.007, + "num_tokens": 173677364.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 1617.0, + "completions/mean_length": 2668.109375, + "completions/mean_terminated_length": 1617.0, + "completions/min_length": 1617.0, + "completions/min_terminated_length": 1617.0, + "epoch": 30.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.24837273393300843, + "kl": 0.8681640625, + "learning_rate": 7.085888294524561e-07, + "loss": 0.0087, + "num_tokens": 174149954.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3724.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3568.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20597077289203472, + "kl": 0.71484375, + "learning_rate": 7.025032056921117e-07, + "loss": 0.0071, + "num_tokens": 174756650.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3816.0, + "completions/max_terminated_length": 131.0, + "completions/mean_length": 2984.3359375, + "completions/mean_terminated_length": 69.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 31.137931034482758, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2017257131941572, + "kl": 0.7939453125, + "learning_rate": 6.965034907965349e-07, + "loss": 0.0079, + "num_tokens": 175269717.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 12.0, + "completions/mean_length": 3485.9296875, + "completions/mean_terminated_length": 12.0, + "completions/min_length": 12.0, + "completions/min_terminated_length": 12.0, + "epoch": 31.20689655172414, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5546489851899966, + "kl": 0.865234375, + "learning_rate": 6.905899365017462e-07, + "loss": 0.0086, + "num_tokens": 175846988.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3644.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2905.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1389.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.275862068965516, + "frac_reward_zero_std": 0.875, + "grad_norm": 12.66676442762852, + "kl": 0.802734375, + "learning_rate": 6.847627909286409e-07, + "loss": 0.008, + "num_tokens": 176349900.0, + "reward": 0.0031250000465661287, + "reward_std": 0.0088388342410326, + "rewards/code_format_reward/mean": 0.015625, + "rewards/code_format_reward/std": 0.12450689822435379, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3878.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2902.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1789.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.344827586206897, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.22258308896262888, + "kl": 0.9619140625, + "learning_rate": 6.790222985725761e-07, + "loss": 0.0096, + "num_tokens": 176852260.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3192.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 2895.75, + "completions/mean_terminated_length": 63.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 31.413793103448278, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.20057791811079195, + "kl": 0.876953125, + "learning_rate": 6.733687002931141e-07, + "loss": 0.0088, + "num_tokens": 177353988.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4071.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 3624.9609375, + "completions/mean_terminated_length": 546.0, + "completions/min_length": 546.0, + "completions/min_terminated_length": 546.0, + "epoch": 31.482758620689655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.16691217765560545, + "kl": 0.8330078125, + "learning_rate": 6.678022333039158e-07, + "loss": 0.0083, + "num_tokens": 177949055.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3855.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3600.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2943.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.551724137931036, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18536855255083134, + "kl": 0.7431640625, + "learning_rate": 6.623231311627876e-07, + "loss": 0.0074, + "num_tokens": 178539759.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3307.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2233.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.620689655172413, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3504897136928905, + "kl": 0.7724609375, + "learning_rate": 6.569316237618811e-07, + "loss": 0.0077, + "num_tokens": 179094127.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3221.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2751.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.689655172413794, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.6973466278935714, + "kl": 0.8564453125, + "learning_rate": 6.516279373180499e-07, + "loss": 0.0086, + "num_tokens": 179635983.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3974.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3638.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3184.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.75862068965517, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.18025035415172505, + "kl": 0.751953125, + "learning_rate": 6.464122943633543e-07, + "loss": 0.0075, + "num_tokens": 180232783.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3818.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3522.0, + "completions/min_terminated_length": 0.0, + "epoch": 31.82758620689655, + "frac_reward_zero_std": 0.9375, + "grad_norm": 3.1358050811645803, + "kl": 0.833984375, + "learning_rate": 6.412849137357271e-07, + "loss": 0.0083, + "num_tokens": 180852591.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3961.0, + "completions/max_terminated_length": 152.0, + "completions/mean_length": 2886.921875, + "completions/mean_terminated_length": 100.0, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 31.896551724137932, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2948274001749933, + "kl": 0.923828125, + "learning_rate": 6.3624601056979e-07, + "loss": 0.0093, + "num_tokens": 181352021.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3620.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 3160.2421875, + "completions/mean_terminated_length": 209.0, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 31.96551724137931, + "frac_reward_zero_std": 0.875, + "grad_norm": 3.7274024380491744, + "kl": 1.32421875, + "learning_rate": 6.312957962878278e-07, + "loss": 0.0132, + "num_tokens": 181887604.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3098.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2582.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2358.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.06896551724138, + "frac_reward_zero_std": 1.0, + "grad_norm": 2.136426703722538, + "kl": 1.2822265625, + "learning_rate": 6.264344785909181e-07, + "loss": 0.0128, + "num_tokens": 182349268.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3458.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 3162.3046875, + "completions/mean_terminated_length": 1037.0, + "completions/min_length": 1037.0, + "completions/min_terminated_length": 1037.0, + "epoch": 32.13793103448276, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.2520983576156235, + "kl": 0.9404296875, + "learning_rate": 6.216622614502149e-07, + "loss": 0.0094, + "num_tokens": 182885115.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3262.0, + "completions/max_terminated_length": 2581.0, + "completions/mean_length": 2184.6640625, + "completions/mean_terminated_length": 1304.5, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "epoch": 32.206896551724135, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.366011224816465, + "kl": 0.95703125, + "learning_rate": 6.169793450983916e-07, + "loss": 0.0096, + "num_tokens": 183295824.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3567.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3170.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2860.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8140210391828681, + "kl": 0.822265625, + "learning_rate": 6.123859260212393e-07, + "loss": 0.0082, + "num_tokens": 183832752.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3251.0, + "completions/max_terminated_length": 59.0, + "completions/mean_length": 2810.3203125, + "completions/mean_terminated_length": 59.0, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 32.3448275862069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.332293413639309, + "kl": 0.76171875, + "learning_rate": 6.07882196949423e-07, + "loss": 0.0076, + "num_tokens": 184323105.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3231.0, + "completions/max_terminated_length": 111.0, + "completions/mean_length": 3132.3125, + "completions/mean_terminated_length": 111.0, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 32.41379310344828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5224894407491708, + "kl": 0.8525390625, + "learning_rate": 6.034683468503948e-07, + "loss": 0.0085, + "num_tokens": 184855113.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3451.0, + "completions/max_terminated_length": 114.0, + "completions/mean_length": 2282.9375, + "completions/mean_terminated_length": 63.5, + "completions/min_length": 13.0, + "completions/min_terminated_length": 13.0, + "epoch": 32.48275862068966, + "frac_reward_zero_std": 0.9375, + "grad_norm": 11.166972360690067, + "kl": 1.13671875, + "learning_rate": 5.991445609204641e-07, + "loss": 0.0092, + "num_tokens": 185277305.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3301.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 2850.5859375, + "completions/mean_terminated_length": 81.0, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 32.55172413793103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.7828655061364351, + "kl": 0.9892578125, + "learning_rate": 5.949110205770292e-07, + "loss": 0.0099, + "num_tokens": 185772084.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3713.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3115.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2377.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.62068965517241, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6840848825830977, + "kl": 0.8837890625, + "learning_rate": 5.90767903450964e-07, + "loss": 0.0088, + "num_tokens": 186301940.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3131.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 2390.8984375, + "completions/mean_terminated_length": 828.0, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "epoch": 32.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.48156033465185893, + "kl": 0.9599609375, + "learning_rate": 5.867153833791652e-07, + "loss": 0.0096, + "num_tokens": 186737647.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3578.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3291.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2993.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.758620689655174, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5346660425103332, + "kl": 0.9501953125, + "learning_rate": 5.827536303972587e-07, + "loss": 0.0095, + "num_tokens": 187290031.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3472.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3027.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.827586206896555, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2648468875740046, + "kl": 0.7978515625, + "learning_rate": 5.78882810732465e-07, + "loss": 0.008, + "num_tokens": 187864439.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3346.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2923.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2373.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.89655172413793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.4194738188643489, + "kl": 0.8076171875, + "learning_rate": 5.75103086796625e-07, + "loss": 0.0081, + "num_tokens": 188369719.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3460.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2908.0, + "completions/min_terminated_length": 0.0, + "epoch": 32.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.5584473553040513, + "kl": 0.833984375, + "learning_rate": 5.714146171793846e-07, + "loss": 0.0083, + "num_tokens": 188942767.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 3390.0, + "completions/max_terminated_length": 67.0, + "completions/mean_length": 2825.0703125, + "completions/mean_terminated_length": 28.666667938232422, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 33.06896551724138, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.2837544331197157, + "kl": 0.876953125, + "learning_rate": 5.678175566415422e-07, + "loss": 0.0088, + "num_tokens": 189435008.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.875, + "completions/max_length": 3617.0, + "completions/max_terminated_length": 209.0, + "completions/mean_length": 2804.75, + "completions/mean_terminated_length": 74.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 33.13793103448276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.8731926996051723, + "kl": 0.958984375, + "learning_rate": 5.643120561085528e-07, + "loss": 0.0096, + "num_tokens": 189923920.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3720.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3535.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3373.0, + "completions/min_terminated_length": 0.0, + "epoch": 33.206896551724135, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1566863300497092, + "kl": 1.0615234375, + "learning_rate": 5.608982626641991e-07, + "loss": 0.0106, + "num_tokens": 190507536.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4073.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3281.875, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 33.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6367413455232471, + "kl": 1.017578125, + "learning_rate": 5.575763195444166e-07, + "loss": 0.0102, + "num_tokens": 191058688.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2855.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1405.0, + "completions/min_terminated_length": 0.0, + "epoch": 33.3448275862069, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6884424033507358, + "kl": 1.095703125, + "learning_rate": 5.543463661312847e-07, + "loss": 0.011, + "num_tokens": 191555264.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3203.0, + "completions/max_terminated_length": 105.0, + "completions/mean_length": 2655.2265625, + "completions/mean_terminated_length": 105.0, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 33.41379310344828, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.11921707423536955, + "kl": 0.90625, + "learning_rate": 5.512085379471808e-07, + "loss": 0.009, + "num_tokens": 192026205.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 1131.0, + "completions/mean_length": 3307.9609375, + "completions/mean_terminated_length": 1131.0, + "completions/min_length": 1131.0, + "completions/min_terminated_length": 1131.0, + "epoch": 33.48275862068966, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.2074465134989551, + "kl": 0.8173828125, + "learning_rate": 5.481629666490903e-07, + "loss": 0.0082, + "num_tokens": 192579552.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3886.0, + "completions/max_terminated_length": 189.0, + "completions/mean_length": 3260.5234375, + "completions/mean_terminated_length": 119.5, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 33.55172413793103, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3890539429034864, + "kl": 0.853515625, + "learning_rate": 5.452097800230853e-07, + "loss": 0.0085, + "num_tokens": 193127739.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3829.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 2894.0, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2358.0, + "completions/min_terminated_length": 0.0, + "epoch": 33.62068965517241, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.875283802975271, + "kl": 0.9873046875, + "learning_rate": 5.423491019789623e-07, + "loss": 0.0098, + "num_tokens": 193629243.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 3444.3515625, + "completions/mean_terminated_length": 14.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 33.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.37285965772825935, + "kl": 0.857421875, + "learning_rate": 5.395810525450425e-07, + "loss": 0.0086, + "num_tokens": 194201192.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3937.0, + "completions/max_terminated_length": 177.0, + "completions/mean_length": 3250.1015625, + "completions/mean_terminated_length": 177.0, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 33.758620689655174, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.368582018342944, + "kl": 0.875, + "learning_rate": 5.369057478631359e-07, + "loss": 0.0088, + "num_tokens": 194748277.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3457.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2987.0, + "completions/min_terminated_length": 0.0, + "epoch": 33.827586206896555, + "frac_reward_zero_std": 0.875, + "grad_norm": 5.045523147578903, + "kl": 0.802734375, + "learning_rate": 5.343233001836694e-07, + "loss": 0.008, + "num_tokens": 195320781.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 3898.0, + "completions/max_terminated_length": 899.0, + "completions/mean_length": 3029.125, + "completions/mean_terminated_length": 330.3333435058594, + "completions/min_length": 19.0, + "completions/min_terminated_length": 19.0, + "epoch": 33.89655172413793, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.3221564952389173, + "kl": 0.912109375, + "learning_rate": 5.318338178609754e-07, + "loss": 0.0091, + "num_tokens": 195838413.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3656.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3382.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3204.0, + "completions/min_terminated_length": 0.0, + "epoch": 33.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.6415043501475972, + "kl": 0.7998046875, + "learning_rate": 5.294374053487459e-07, + "loss": 0.008, + "num_tokens": 196402477.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3654.0, + "completions/max_terminated_length": 126.0, + "completions/mean_length": 3087.46875, + "completions/mean_terminated_length": 126.0, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 34.06896551724138, + "frac_reward_zero_std": 0.9375, + "grad_norm": 2.564493390981493, + "kl": 0.94921875, + "learning_rate": 5.271341631956511e-07, + "loss": 0.0095, + "num_tokens": 196928745.0, + "reward": 0.0015625000232830644, + "reward_std": 0.002893187804147601, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3684.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 3392.625, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 34.13793103448276, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.937823822012556, + "kl": 0.8486328125, + "learning_rate": 5.249241880411181e-07, + "loss": 0.0085, + "num_tokens": 197494073.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3966.0, + "completions/max_terminated_length": 50.0, + "completions/mean_length": 3509.4140625, + "completions/mean_terminated_length": 50.0, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 34.206896551724135, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1941849080255262, + "kl": 0.8662109375, + "learning_rate": 5.228075726112785e-07, + "loss": 0.0087, + "num_tokens": 198074350.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3984.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3429.5, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2261.0, + "completions/min_terminated_length": 0.0, + "epoch": 34.275862068965516, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.368989622275372, + "kl": 1.0263671875, + "learning_rate": 5.207844057150768e-07, + "loss": 0.0103, + "num_tokens": 198644398.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3260.3125, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 34.3448275862069, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.6426625269815358, + "kl": 0.9677734375, + "learning_rate": 5.188547722405437e-07, + "loss": 0.0097, + "num_tokens": 199191886.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3973.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 3510.6640625, + "completions/mean_terminated_length": 244.0, + "completions/min_length": 244.0, + "completions/min_terminated_length": 244.0, + "epoch": 34.41379310344828, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1996.6623380906256, + "kl": 63.958984375, + "learning_rate": 5.170187531512351e-07, + "loss": 0.6407, + "num_tokens": 199772323.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 76.0, + "completions/mean_length": 3746.34375, + "completions/mean_terminated_length": 76.0, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 34.48275862068966, + "frac_reward_zero_std": 0.875, + "grad_norm": 5.607165443611961, + "kl": 1.0458984375, + "learning_rate": 5.152764254828348e-07, + "loss": 0.0104, + "num_tokens": 200380663.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3172.0, + "completions/max_terminated_length": 164.0, + "completions/mean_length": 2518.75, + "completions/mean_terminated_length": 164.0, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 34.55172413793103, + "frac_reward_zero_std": 0.9375, + "grad_norm": 6.292445339111056, + "kl": 1.099609375, + "learning_rate": 5.136278623399225e-07, + "loss": 0.011, + "num_tokens": 200832759.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3886.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 3176.1015625, + "completions/mean_terminated_length": 608.0, + "completions/min_length": 393.0, + "completions/min_terminated_length": 393.0, + "epoch": 34.62068965517241, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.8352783555090613, + "kl": 1.0361328125, + "learning_rate": 5.120731328929058e-07, + "loss": 0.0103, + "num_tokens": 201369204.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3933.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 3606.625, + "completions/mean_terminated_length": 781.0, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "epoch": 34.689655172413794, + "frac_reward_zero_std": 1.0, + "grad_norm": 1.1493563937634361, + "kl": 0.912109375, + "learning_rate": 5.106123023751187e-07, + "loss": 0.0091, + "num_tokens": 201961924.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4001.0, + "completions/max_terminated_length": 83.0, + "completions/mean_length": 3408.46875, + "completions/mean_terminated_length": 49.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 34.758620689655174, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.5486194926247445, + "kl": 1.109375, + "learning_rate": 5.092454320800833e-07, + "loss": 0.031, + "num_tokens": 202529280.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3826.0, + "completions/max_terminated_length": 55.0, + "completions/mean_length": 3523.1484375, + "completions/mean_terminated_length": 34.5, + "completions/min_length": 14.0, + "completions/min_terminated_length": 14.0, + "epoch": 34.827586206896555, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.3741479681169653, + "kl": 1.080078125, + "learning_rate": 5.079725793589405e-07, + "loss": 0.0092, + "num_tokens": 203110875.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3439.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3050.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 2591.0, + "completions/min_terminated_length": 0.0, + "epoch": 34.89655172413793, + "frac_reward_zero_std": 0.875, + "grad_norm": 2.7986592185842185, + "kl": 1.359375, + "learning_rate": 5.067937976180407e-07, + "loss": 0.0136, + "num_tokens": 203632379.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3817.0, + "completions/max_terminated_length": 192.0, + "completions/mean_length": 3290.6796875, + "completions/mean_terminated_length": 192.0, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "epoch": 34.96551724137931, + "frac_reward_zero_std": 1.0, + "grad_norm": 3.885159341324509, + "kl": 2.0, + "learning_rate": 5.057091363167046e-07, + "loss": 0.02, + "num_tokens": 204184658.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 4090.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3021.75, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 1211.0, + "completions/min_terminated_length": 0.0, + "epoch": 35.06896551724138, + "frac_reward_zero_std": 0.875, + "grad_norm": 4.282611410016724, + "kl": 1.685546875, + "learning_rate": 5.047186409651489e-07, + "loss": 0.0169, + "num_tokens": 204701370.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 14.0, + "completions/mean_length": 3723.5, + "completions/mean_terminated_length": 11.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.13793103448276, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.5373039872226522, + "kl": 2.14453125, + "learning_rate": 5.038223531225742e-07, + "loss": 0.0234, + "num_tokens": 205308146.0, + "reward": 0.0015625000232830644, + "reward_std": 0.0044194171205163, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.015625, + "rewards/format_reward/std": 0.12450689822435379, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.96875, + "completions/max_length": 3439.0, + "completions/max_terminated_length": 8.0, + "completions/mean_length": 3011.4609375, + "completions/mean_terminated_length": 8.0, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.206896551724135, + "frac_reward_zero_std": 0.9375, + "grad_norm": 4.841716912287862, + "kl": 3.078125, + "learning_rate": 5.030203103954232e-07, + "loss": 0.0308, + "num_tokens": 205824685.0, + "reward": 0.0007812500116415322, + "reward_std": 0.00220970856025815, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0078125, + "rewards/format_reward/std": 0.0883883461356163, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 4077.0, + "completions/max_terminated_length": 222.0, + "completions/mean_length": 3439.9609375, + "completions/mean_terminated_length": 81.33333587646484, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.275862068965516, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.575034152640209, + "kl": 3.66796875, + "learning_rate": 5.023125464358026e-07, + "loss": 0.0346, + "num_tokens": 206394536.0, + "reward": 0.00390625, + "reward_std": 0.009522313252091408, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.90625, + "completions/max_length": 4037.0, + "completions/max_terminated_length": 2761.0, + "completions/mean_length": 3622.9140625, + "completions/mean_terminated_length": 1451.0, + "completions/min_length": 675.0, + "completions/min_terminated_length": 675.0, + "epoch": 35.3448275862069, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.7179488310500055, + "kl": 3.53125, + "learning_rate": 5.016990909400709e-07, + "loss": 0.0366, + "num_tokens": 206989341.0, + "reward": 0.00390625, + "reward_std": 0.009522313252091408, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 1.0, + "completions/max_length": 3968.0, + "completions/max_terminated_length": 0.0, + "completions/mean_length": 3635.25, + "completions/mean_terminated_length": 0.0, + "completions/min_length": 3292.0, + "completions/min_terminated_length": 0.0, + "epoch": 35.41379310344828, + "frac_reward_zero_std": 0.5625, + "grad_norm": 20.553933911524943, + "kl": 4.58984375, + "learning_rate": 5.011799696475915e-07, + "loss": 0.046, + "num_tokens": 207585493.0, + "reward": 0.007031249813735485, + "reward_std": 0.018361147493124008, + "rewards/code_format_reward/mean": 0.0078125, + "rewards/code_format_reward/std": 0.0883883461356163, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.24301259219646454, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4021.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 3574.4140625, + "completions/mean_terminated_length": 127.5, + "completions/min_length": 8.0, + "completions/min_terminated_length": 8.0, + "epoch": 35.48275862068966, + "frac_reward_zero_std": 0.8125, + "grad_norm": 9.096320879858352, + "kl": 4.7734375, + "learning_rate": 5.007552043396547e-07, + "loss": 0.0469, + "num_tokens": 208174090.0, + "reward": 0.0023437500931322575, + "reward_std": 0.00662912568077445, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0234375, + "rewards/format_reward/std": 0.15188287198543549, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4029.0, + "completions/max_terminated_length": 15.0, + "completions/mean_length": 3392.75, + "completions/mean_terminated_length": 15.0, + "completions/min_length": 15.0, + "completions/min_terminated_length": 15.0, + "epoch": 35.55172413793103, + "frac_reward_zero_std": 0.6875, + "grad_norm": 4.488695050523634, + "kl": 6.3515625, + "learning_rate": 5.004248128385618e-07, + "loss": 0.0624, + "num_tokens": 208738266.0, + "reward": 0.0054687499068677425, + "reward_std": 0.012415501289069653, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 3688.0234375, + "completions/mean_terminated_length": 524.5, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 35.62068965517241, + "frac_reward_zero_std": 0.75, + "grad_norm": 4.721507511320711, + "kl": 7.0859375, + "learning_rate": 5.001888090068784e-07, + "loss": 0.073, + "num_tokens": 209340237.0, + "reward": 0.00390625, + "reward_std": 0.009522313252091408, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0390625, + "rewards/format_reward/std": 0.194504976272583, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.9375, + "completions/max_length": 3696.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 3322.875, + "completions/mean_terminated_length": 278.0, + "completions/min_length": 11.0, + "completions/min_terminated_length": 11.0, + "epoch": 35.689655172413794, + "frac_reward_zero_std": 0.625, + "grad_norm": 3.936276706948606, + "kl": 6.3515625, + "learning_rate": 5.000472027468528e-07, + "loss": 0.0576, + "num_tokens": 209896637.0, + "reward": 0.0054687499068677425, + "reward_std": 0.013941730372607708, + "rewards/code_format_reward/mean": 0.0, + "rewards/code_format_reward/std": 0.0, + "rewards/format_reward/mean": 0.0546875, + "rewards/format_reward/std": 0.22826264798641205, + "rewards/ioi_code_reward/mean": 0.0, + "rewards/ioi_code_reward/std": 0.0, + "step": 500 + }, + { + "epoch": 35.689655172413794, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.09227038282398825, + "train_runtime": 24234.9154, + "train_samples_per_second": 2.641, + "train_steps_per_second": 0.021 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 209896637, + "num_train_epochs": 36, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}