{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.128, "eval_steps": 500, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00032, "frac_reward_zero_std": 0.25, "grad_norm": 0.8027206659317017, "kl": 1.71183273778297e-05, "learning_rate": 5e-06, "loss": 0.0, "num_tokens": 196560.0, "reward": 0.7936198115348816, "reward_std": 0.17962533235549927, "rewards/reward_len/mean": 0.7936197519302368, "rewards/reward_len/std": 0.3258915841579437, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00064, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7602783441543579, "kl": 0.00532680656760931, "learning_rate": 4.999999859632295e-06, "loss": 0.0, "num_tokens": 393040.0, "reward": 0.7620443105697632, "reward_std": 0.08783292770385742, "rewards/reward_len/mean": 0.7620443105697632, "rewards/reward_len/std": 0.3672178089618683, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00096, "frac_reward_zero_std": 0.5, "grad_norm": 0.7249953150749207, "kl": 0.005223212763667107, "learning_rate": 4.9999994385291934e-06, "loss": 0.0, "num_tokens": 589344.0, "reward": 0.8811849355697632, "reward_std": 0.13462764024734497, "rewards/reward_len/mean": 0.8811849355697632, "rewards/reward_len/std": 0.2868101894855499, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00128, "frac_reward_zero_std": 0.25, "grad_norm": 1.0998049974441528, "kl": 0.005302521400153637, "learning_rate": 4.9999987366907436e-06, "loss": 0.0, "num_tokens": 785824.0, "reward": 0.837890625, "reward_std": 0.17104606330394745, "rewards/reward_len/mean": 0.837890625, "rewards/reward_len/std": 0.274716854095459, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0016, "frac_reward_zero_std": 0.375, "grad_norm": 0.8081106543540955, "kl": 0.005471091717481613, "learning_rate": 4.999997754117024e-06, "loss": 0.0, "num_tokens": 982400.0, "reward": 0.7578125, "reward_std": 0.15783792734146118, "rewards/reward_len/mean": 0.7578125, "rewards/reward_len/std": 0.3556174635887146, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00192, "frac_reward_zero_std": 0.25, "grad_norm": 0.7931028008460999, "kl": 0.005383472889661789, "learning_rate": 4.999996490808146e-06, "loss": 0.0, "num_tokens": 1178736.0, "reward": 0.8411458730697632, "reward_std": 0.15851536393165588, "rewards/reward_len/mean": 0.8411458730697632, "rewards/reward_len/std": 0.2852468192577362, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00224, "frac_reward_zero_std": 0.375, "grad_norm": 0.855617880821228, "kl": 0.005938877817243338, "learning_rate": 4.9999949467642495e-06, "loss": 0.0, "num_tokens": 1375184.0, "reward": 0.7444661855697632, "reward_std": 0.18254899978637695, "rewards/reward_len/mean": 0.7444661855697632, "rewards/reward_len/std": 0.5324408411979675, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00256, "frac_reward_zero_std": 0.375, "grad_norm": 0.8006792068481445, "kl": 0.006116841919720173, "learning_rate": 4.999993121985509e-06, "loss": 0.0, "num_tokens": 1571472.0, "reward": 0.755859375, "reward_std": 0.14840292930603027, "rewards/reward_len/mean": 0.755859375, "rewards/reward_len/std": 0.41854918003082275, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00288, "frac_reward_zero_std": 0.625, "grad_norm": 0.5651379823684692, "kl": 0.006282718852162361, "learning_rate": 4.99999101647213e-06, "loss": 0.0, "num_tokens": 1767696.0, "reward": 0.8597005605697632, "reward_std": 0.09105785191059113, "rewards/reward_len/mean": 0.8597005605697632, "rewards/reward_len/std": 0.3063139021396637, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0032, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7746672630310059, "kl": 0.006277492269873619, "learning_rate": 4.9999886302243486e-06, "loss": 0.0, "num_tokens": 1964352.0, "reward": 0.736328125, "reward_std": 0.16170179843902588, "rewards/reward_len/mean": 0.736328125, "rewards/reward_len/std": 0.35940179228782654, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00352, "frac_reward_zero_std": 0.25, "grad_norm": 0.8158230185508728, "kl": 0.0064939698204398155, "learning_rate": 4.999985963242432e-06, "loss": 0.0, "num_tokens": 2160848.0, "reward": 0.7190755605697632, "reward_std": 0.14770188927650452, "rewards/reward_len/mean": 0.7190755605697632, "rewards/reward_len/std": 0.39762866497039795, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00384, "frac_reward_zero_std": 0.625, "grad_norm": 0.4778946042060852, "kl": 0.00701224897056818, "learning_rate": 4.99998301552668e-06, "loss": 0.0, "num_tokens": 2357440.0, "reward": 0.9007161855697632, "reward_std": 0.08900929242372513, "rewards/reward_len/mean": 0.9007161855697632, "rewards/reward_len/std": 0.2015477418899536, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00416, "frac_reward_zero_std": 0.5, "grad_norm": 0.7480363845825195, "kl": 0.006892648059874773, "learning_rate": 4.999979787077425e-06, "loss": 0.0, "num_tokens": 2553776.0, "reward": 0.736328125, "reward_std": 0.11627350747585297, "rewards/reward_len/mean": 0.736328125, "rewards/reward_len/std": 0.4063463509082794, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00448, "frac_reward_zero_std": 0.375, "grad_norm": 0.6913530826568604, "kl": 0.007525671273469925, "learning_rate": 4.9999762778950265e-06, "loss": 0.0, "num_tokens": 2750176.0, "reward": 0.8733724355697632, "reward_std": 0.13304749131202698, "rewards/reward_len/mean": 0.8733724355697632, "rewards/reward_len/std": 0.260817289352417, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0048, "frac_reward_zero_std": 0.3125, "grad_norm": 0.9006426334381104, "kl": 0.007139429450035095, "learning_rate": 4.999972487979882e-06, "loss": 0.0, "num_tokens": 2946432.0, "reward": 0.7353515625, "reward_std": 0.18793734908103943, "rewards/reward_len/mean": 0.7353515625, "rewards/reward_len/std": 0.4250172972679138, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00512, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7369216084480286, "kl": 0.007592486217617989, "learning_rate": 4.999968417332415e-06, "loss": 0.0, "num_tokens": 3142960.0, "reward": 0.8525390625, "reward_std": 0.13952995836734772, "rewards/reward_len/mean": 0.8525390625, "rewards/reward_len/std": 0.2791503369808197, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00544, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7958468198776245, "kl": 0.007764819078147411, "learning_rate": 4.999964065953083e-06, "loss": 0.0, "num_tokens": 3339504.0, "reward": 0.7796224355697632, "reward_std": 0.13519705832004547, "rewards/reward_len/mean": 0.7796224355697632, "rewards/reward_len/std": 0.2986285090446472, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00576, "frac_reward_zero_std": 0.0, "grad_norm": 0.8955850005149841, "kl": 0.008127257227897644, "learning_rate": 4.999959433842374e-06, "loss": 0.0, "num_tokens": 3536064.0, "reward": 0.6917318105697632, "reward_std": 0.22921673953533173, "rewards/reward_len/mean": 0.6917317509651184, "rewards/reward_len/std": 0.3818399906158447, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00608, "frac_reward_zero_std": 0.375, "grad_norm": 0.7979457974433899, "kl": 0.007617972791194916, "learning_rate": 4.999954521000811e-06, "loss": 0.0, "num_tokens": 3732256.0, "reward": 0.78515625, "reward_std": 0.1195596233010292, "rewards/reward_len/mean": 0.78515625, "rewards/reward_len/std": 0.32990217208862305, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0064, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7890306115150452, "kl": 0.007777344435453415, "learning_rate": 4.999949327428941e-06, "loss": 0.0, "num_tokens": 3928640.0, "reward": 0.7919921875, "reward_std": 0.17304885387420654, "rewards/reward_len/mean": 0.7919921875, "rewards/reward_len/std": 0.3526277244091034, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00672, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7711103558540344, "kl": 0.008527114056050777, "learning_rate": 4.999943853127351e-06, "loss": 0.0, "num_tokens": 4124784.0, "reward": 0.9065755605697632, "reward_std": 0.09724702686071396, "rewards/reward_len/mean": 0.9065755605697632, "rewards/reward_len/std": 0.252506285905838, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00704, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7866696119308472, "kl": 0.008756128139793873, "learning_rate": 4.999938098096655e-06, "loss": 0.0, "num_tokens": 4321184.0, "reward": 0.8844401240348816, "reward_std": 0.1446431577205658, "rewards/reward_len/mean": 0.8844401240348816, "rewards/reward_len/std": 0.2464737743139267, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00736, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8422706723213196, "kl": 0.009292546659708023, "learning_rate": 4.999932062337498e-06, "loss": 0.0, "num_tokens": 4517488.0, "reward": 0.9410807490348816, "reward_std": 0.12082913517951965, "rewards/reward_len/mean": 0.9410807490348816, "rewards/reward_len/std": 0.16378451883792877, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00768, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7936801314353943, "kl": 0.009337708353996277, "learning_rate": 4.999925745850559e-06, "loss": 0.0, "num_tokens": 4713888.0, "reward": 0.7981771230697632, "reward_std": 0.1319822520017624, "rewards/reward_len/mean": 0.7981771230697632, "rewards/reward_len/std": 0.29971110820770264, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.008, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8625982403755188, "kl": 0.009664739482104778, "learning_rate": 4.999919148636547e-06, "loss": 0.0, "num_tokens": 4910480.0, "reward": 0.7828776240348816, "reward_std": 0.15538470447063446, "rewards/reward_len/mean": 0.7828776240348816, "rewards/reward_len/std": 0.3985660672187805, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00832, "frac_reward_zero_std": 0.1875, "grad_norm": 0.7866320013999939, "kl": 0.009115578606724739, "learning_rate": 4.999912270696202e-06, "loss": 0.0, "num_tokens": 5107040.0, "reward": 0.837890625, "reward_std": 0.19406211376190186, "rewards/reward_len/mean": 0.837890625, "rewards/reward_len/std": 0.28484001755714417, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00864, "frac_reward_zero_std": 0.25, "grad_norm": 0.7605833411216736, "kl": 0.010722242295742035, "learning_rate": 4.999905112030298e-06, "loss": 0.0, "num_tokens": 5303472.0, "reward": 0.8326823115348816, "reward_std": 0.16619378328323364, "rewards/reward_len/mean": 0.8326823115348816, "rewards/reward_len/std": 0.3326784670352936, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00896, "frac_reward_zero_std": 0.375, "grad_norm": 0.8247683048248291, "kl": 0.010293405503034592, "learning_rate": 4.999897672639636e-06, "loss": 0.0, "num_tokens": 5499968.0, "reward": 0.7662760019302368, "reward_std": 0.13208766281604767, "rewards/reward_len/mean": 0.7662760615348816, "rewards/reward_len/std": 0.3626876771450043, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00928, "frac_reward_zero_std": 0.1875, "grad_norm": 0.9354327917098999, "kl": 0.010291656479239464, "learning_rate": 4.9998899525250556e-06, "loss": 0.0, "num_tokens": 5696496.0, "reward": 0.6142578125, "reward_std": 0.2659006714820862, "rewards/reward_len/mean": 0.6142578125, "rewards/reward_len/std": 0.5169708728790283, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0096, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7617059946060181, "kl": 0.011677569709718227, "learning_rate": 4.99988195168742e-06, "loss": 0.0, "num_tokens": 5892672.0, "reward": 0.818359375, "reward_std": 0.16987799108028412, "rewards/reward_len/mean": 0.818359375, "rewards/reward_len/std": 0.2849595248699188, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.00992, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6369988322257996, "kl": 0.011780554428696632, "learning_rate": 4.99987367012763e-06, "loss": 0.0, "num_tokens": 6089264.0, "reward": 0.8307291865348816, "reward_std": 0.15030531585216522, "rewards/reward_len/mean": 0.8307291865348816, "rewards/reward_len/std": 0.3026638329029083, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01024, "frac_reward_zero_std": 0.625, "grad_norm": 0.5698184370994568, "kl": 0.011647619307041168, "learning_rate": 4.9998651078466144e-06, "loss": 0.0, "num_tokens": 6285792.0, "reward": 0.8603515625, "reward_std": 0.06890285015106201, "rewards/reward_len/mean": 0.8603515625, "rewards/reward_len/std": 0.3024090826511383, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01056, "frac_reward_zero_std": 0.625, "grad_norm": 0.7035416960716248, "kl": 0.01283220387995243, "learning_rate": 4.999856264845334e-06, "loss": 0.0, "num_tokens": 6482272.0, "reward": 0.8284505605697632, "reward_std": 0.0647423267364502, "rewards/reward_len/mean": 0.8284505605697632, "rewards/reward_len/std": 0.2645607888698578, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01088, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7955151200294495, "kl": 0.013212130405008793, "learning_rate": 4.999847141124784e-06, "loss": 0.0, "num_tokens": 6678928.0, "reward": 0.9065755605697632, "reward_std": 0.15104928612709045, "rewards/reward_len/mean": 0.9065755605697632, "rewards/reward_len/std": 0.2394418716430664, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0112, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7888926267623901, "kl": 0.01455118041485548, "learning_rate": 4.999837736685987e-06, "loss": 0.0, "num_tokens": 6875680.0, "reward": 0.8001302480697632, "reward_std": 0.18624618649482727, "rewards/reward_len/mean": 0.8001302480697632, "rewards/reward_len/std": 0.3228481709957123, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01152, "frac_reward_zero_std": 0.375, "grad_norm": 0.7999125719070435, "kl": 0.016429290175437927, "learning_rate": 4.9998280515300006e-06, "loss": 0.0, "num_tokens": 7071792.0, "reward": 0.7805989980697632, "reward_std": 0.10644528269767761, "rewards/reward_len/mean": 0.7805989980697632, "rewards/reward_len/std": 0.3641582727432251, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01184, "frac_reward_zero_std": 0.1875, "grad_norm": 0.8360075950622559, "kl": 0.01572563126683235, "learning_rate": 4.999818085657911e-06, "loss": 0.0, "num_tokens": 7268400.0, "reward": 0.8115234375, "reward_std": 0.19910216331481934, "rewards/reward_len/mean": 0.8115234375, "rewards/reward_len/std": 0.3251221179962158, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01216, "frac_reward_zero_std": 0.25, "grad_norm": 0.8303148746490479, "kl": 0.015369746834039688, "learning_rate": 4.9998078390708375e-06, "loss": 0.0, "num_tokens": 7464832.0, "reward": 0.7236328125, "reward_std": 0.20254287123680115, "rewards/reward_len/mean": 0.7236328125, "rewards/reward_len/std": 0.49883776903152466, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01248, "frac_reward_zero_std": 0.25, "grad_norm": 0.8379618525505066, "kl": 0.015162697061896324, "learning_rate": 4.999797311769932e-06, "loss": 0.0, "num_tokens": 7661264.0, "reward": 0.8512369990348816, "reward_std": 0.1815636157989502, "rewards/reward_len/mean": 0.8512369990348816, "rewards/reward_len/std": 0.30639442801475525, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0128, "frac_reward_zero_std": 0.25, "grad_norm": 0.9366811513900757, "kl": 0.01498054526746273, "learning_rate": 4.999786503756376e-06, "loss": 0.0, "num_tokens": 7857696.0, "reward": 0.8359375, "reward_std": 0.14262336492538452, "rewards/reward_len/mean": 0.8359375, "rewards/reward_len/std": 0.29101234674453735, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01312, "frac_reward_zero_std": 0.125, "grad_norm": 0.8755704164505005, "kl": 0.017075425013899803, "learning_rate": 4.999775415031381e-06, "loss": 0.0, "num_tokens": 8054128.0, "reward": 0.8531901240348816, "reward_std": 0.19629715383052826, "rewards/reward_len/mean": 0.8531901240348816, "rewards/reward_len/std": 0.29085859656333923, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01344, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7478809356689453, "kl": 0.016752440482378006, "learning_rate": 4.999764045596195e-06, "loss": 0.0, "num_tokens": 8250448.0, "reward": 0.8297526240348816, "reward_std": 0.10785592347383499, "rewards/reward_len/mean": 0.8297526240348816, "rewards/reward_len/std": 0.3374955654144287, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01376, "frac_reward_zero_std": 0.375, "grad_norm": 0.79532790184021, "kl": 0.01613219454884529, "learning_rate": 4.999752395452095e-06, "loss": 0.0, "num_tokens": 8446864.0, "reward": 0.798828125, "reward_std": 0.15989980101585388, "rewards/reward_len/mean": 0.798828125, "rewards/reward_len/std": 0.32186606526374817, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01408, "frac_reward_zero_std": 0.4375, "grad_norm": 0.757718563079834, "kl": 0.016810301691293716, "learning_rate": 4.999740464600386e-06, "loss": 0.0, "num_tokens": 8643536.0, "reward": 0.8186849355697632, "reward_std": 0.14407595992088318, "rewards/reward_len/mean": 0.8186849355697632, "rewards/reward_len/std": 0.3191888630390167, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0144, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6717040538787842, "kl": 0.019070003181695938, "learning_rate": 4.9997282530424114e-06, "loss": 0.0, "num_tokens": 8839776.0, "reward": 0.8984375, "reward_std": 0.15830302238464355, "rewards/reward_len/mean": 0.8984375, "rewards/reward_len/std": 0.24285823106765747, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01472, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8255589008331299, "kl": 0.01899189129471779, "learning_rate": 4.999715760779541e-06, "loss": 0.0, "num_tokens": 9036368.0, "reward": 0.7731119990348816, "reward_std": 0.15324297547340393, "rewards/reward_len/mean": 0.7731119394302368, "rewards/reward_len/std": 0.3341211676597595, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01504, "frac_reward_zero_std": 0.125, "grad_norm": 0.9899515509605408, "kl": 0.018817655742168427, "learning_rate": 4.9997029878131776e-06, "loss": 0.0, "num_tokens": 9232672.0, "reward": 0.7311198115348816, "reward_std": 0.198727086186409, "rewards/reward_len/mean": 0.7311197519302368, "rewards/reward_len/std": 0.39068886637687683, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01536, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7901699542999268, "kl": 0.017967861145734787, "learning_rate": 4.999689934144754e-06, "loss": 0.0, "num_tokens": 9429104.0, "reward": 0.8727213740348816, "reward_std": 0.14133989810943604, "rewards/reward_len/mean": 0.8727213144302368, "rewards/reward_len/std": 0.27691739797592163, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01568, "frac_reward_zero_std": 0.25, "grad_norm": 0.8071075677871704, "kl": 0.01948385313153267, "learning_rate": 4.99967659977574e-06, "loss": 0.0, "num_tokens": 9625744.0, "reward": 0.8323568105697632, "reward_std": 0.1779971718788147, "rewards/reward_len/mean": 0.8323567509651184, "rewards/reward_len/std": 0.301727294921875, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.016, "frac_reward_zero_std": 0.25, "grad_norm": 0.7960187792778015, "kl": 0.019299190491437912, "learning_rate": 4.999662984707629e-06, "loss": 0.0, "num_tokens": 9822208.0, "reward": 0.7916666865348816, "reward_std": 0.17278119921684265, "rewards/reward_len/mean": 0.7916666865348816, "rewards/reward_len/std": 0.3409265875816345, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01632, "frac_reward_zero_std": 0.3125, "grad_norm": 0.815869152545929, "kl": 0.01802164316177368, "learning_rate": 4.999649088941951e-06, "loss": 0.0, "num_tokens": 10018832.0, "reward": 0.7574869990348816, "reward_std": 0.16599968075752258, "rewards/reward_len/mean": 0.7574869990348816, "rewards/reward_len/std": 0.3090384006500244, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01664, "frac_reward_zero_std": 0.25, "grad_norm": 0.824347734451294, "kl": 0.020238451659679413, "learning_rate": 4.999634912480268e-06, "loss": 0.0, "num_tokens": 10215008.0, "reward": 0.8421224355697632, "reward_std": 0.14578115940093994, "rewards/reward_len/mean": 0.8421224355697632, "rewards/reward_len/std": 0.27820974588394165, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01696, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7971190810203552, "kl": 0.021430665627121925, "learning_rate": 4.99962045532417e-06, "loss": 0.0, "num_tokens": 10411328.0, "reward": 0.90625, "reward_std": 0.1278911530971527, "rewards/reward_len/mean": 0.90625, "rewards/reward_len/std": 0.22140371799468994, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01728, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7196866273880005, "kl": 0.02385888248682022, "learning_rate": 4.999605717475281e-06, "loss": 0.0, "num_tokens": 10607744.0, "reward": 0.7164713144302368, "reward_std": 0.1332741230726242, "rewards/reward_len/mean": 0.7164713740348816, "rewards/reward_len/std": 0.4236203730106354, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0176, "frac_reward_zero_std": 0.5, "grad_norm": 0.694108247756958, "kl": 0.019158754497766495, "learning_rate": 4.999590698935257e-06, "loss": 0.0, "num_tokens": 10804432.0, "reward": 0.83984375, "reward_std": 0.07713833451271057, "rewards/reward_len/mean": 0.83984375, "rewards/reward_len/std": 0.3475888669490814, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01792, "frac_reward_zero_std": 0.375, "grad_norm": 0.7138301730155945, "kl": 0.021222909912467003, "learning_rate": 4.999575399705782e-06, "loss": 0.0, "num_tokens": 11000960.0, "reward": 0.8362630605697632, "reward_std": 0.13972680270671844, "rewards/reward_len/mean": 0.8362630605697632, "rewards/reward_len/std": 0.3137500584125519, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01824, "frac_reward_zero_std": 0.5, "grad_norm": 0.5983142852783203, "kl": 0.023208746686577797, "learning_rate": 4.999559819788578e-06, "loss": 0.0, "num_tokens": 11197456.0, "reward": 0.8756510615348816, "reward_std": 0.09809666872024536, "rewards/reward_len/mean": 0.8756510615348816, "rewards/reward_len/std": 0.27435725927352905, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01856, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7307888865470886, "kl": 0.021232981234788895, "learning_rate": 4.999543959185391e-06, "loss": 0.0, "num_tokens": 11394064.0, "reward": 0.8444010615348816, "reward_std": 0.11457288265228271, "rewards/reward_len/mean": 0.8444010019302368, "rewards/reward_len/std": 0.28051677346229553, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01888, "frac_reward_zero_std": 0.375, "grad_norm": 0.813261866569519, "kl": 0.02231256291270256, "learning_rate": 4.999527817898004e-06, "loss": 0.0, "num_tokens": 11590608.0, "reward": 0.853515625, "reward_std": 0.17535921931266785, "rewards/reward_len/mean": 0.853515625, "rewards/reward_len/std": 0.4461461901664734, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0192, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7521728873252869, "kl": 0.022967170923948288, "learning_rate": 4.999511395928228e-06, "loss": 0.0, "num_tokens": 11787168.0, "reward": 0.701171875, "reward_std": 0.15171362459659576, "rewards/reward_len/mean": 0.701171875, "rewards/reward_len/std": 0.4368668794631958, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01952, "frac_reward_zero_std": 0.375, "grad_norm": 0.8079439997673035, "kl": 0.023756055161356926, "learning_rate": 4.9994946932779076e-06, "loss": 0.0, "num_tokens": 11983168.0, "reward": 0.9052734375, "reward_std": 0.11953707039356232, "rewards/reward_len/mean": 0.9052734375, "rewards/reward_len/std": 0.23069322109222412, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.01984, "frac_reward_zero_std": 0.5, "grad_norm": 0.6609861850738525, "kl": 0.02364611253142357, "learning_rate": 4.99947770994892e-06, "loss": 0.0, "num_tokens": 12179648.0, "reward": 0.912109375, "reward_std": 0.124283567070961, "rewards/reward_len/mean": 0.912109375, "rewards/reward_len/std": 0.2015216052532196, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02016, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6409368515014648, "kl": 0.02544046938419342, "learning_rate": 4.999460445943169e-06, "loss": 0.0001, "num_tokens": 12375952.0, "reward": 0.7766927480697632, "reward_std": 0.15531742572784424, "rewards/reward_len/mean": 0.7766927480697632, "rewards/reward_len/std": 0.3684055209159851, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02048, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6801385283470154, "kl": 0.02245226688683033, "learning_rate": 4.999442901262598e-06, "loss": 0.0, "num_tokens": 12572432.0, "reward": 0.9143880605697632, "reward_std": 0.09244164079427719, "rewards/reward_len/mean": 0.9143880605697632, "rewards/reward_len/std": 0.18908163905143738, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0208, "frac_reward_zero_std": 0.5, "grad_norm": 0.688310980796814, "kl": 0.025234345346689224, "learning_rate": 4.9994250759091725e-06, "loss": 0.0001, "num_tokens": 12768864.0, "reward": 0.8714193105697632, "reward_std": 0.11069254577159882, "rewards/reward_len/mean": 0.8714193105697632, "rewards/reward_len/std": 0.2919362485408783, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02112, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6891856789588928, "kl": 0.023964237421751022, "learning_rate": 4.999406969884897e-06, "loss": 0.0, "num_tokens": 12965328.0, "reward": 0.8665364980697632, "reward_std": 0.1188942939043045, "rewards/reward_len/mean": 0.8665364980697632, "rewards/reward_len/std": 0.26203906536102295, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02144, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5637570023536682, "kl": 0.026914432644844055, "learning_rate": 4.9993885831918035e-06, "loss": 0.0001, "num_tokens": 13161712.0, "reward": 0.8720703125, "reward_std": 0.09208361804485321, "rewards/reward_len/mean": 0.8720703125, "rewards/reward_len/std": 0.2989489436149597, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02176, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5956972241401672, "kl": 0.026400156319141388, "learning_rate": 4.999369915831958e-06, "loss": 0.0001, "num_tokens": 13358048.0, "reward": 0.7919921875, "reward_std": 0.08180129528045654, "rewards/reward_len/mean": 0.7919921875, "rewards/reward_len/std": 0.3576885163784027, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02208, "frac_reward_zero_std": 0.625, "grad_norm": 0.5297041535377502, "kl": 0.02976173348724842, "learning_rate": 4.999350967807455e-06, "loss": 0.0001, "num_tokens": 13554560.0, "reward": 0.9059244990348816, "reward_std": 0.08187607675790787, "rewards/reward_len/mean": 0.9059244990348816, "rewards/reward_len/std": 0.20148861408233643, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0224, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5297337174415588, "kl": 0.029380029067397118, "learning_rate": 4.999331739120423e-06, "loss": 0.0001, "num_tokens": 13750992.0, "reward": 0.8766276240348816, "reward_std": 0.08794420212507248, "rewards/reward_len/mean": 0.8766276240348816, "rewards/reward_len/std": 0.24631834030151367, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02272, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5374413728713989, "kl": 0.03178559988737106, "learning_rate": 4.999312229773022e-06, "loss": 0.0001, "num_tokens": 13947232.0, "reward": 0.96484375, "reward_std": 0.07383135706186295, "rewards/reward_len/mean": 0.96484375, "rewards/reward_len/std": 0.17232529819011688, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02304, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6321916580200195, "kl": 0.02617986872792244, "learning_rate": 4.9992924397674414e-06, "loss": 0.0001, "num_tokens": 14143712.0, "reward": 0.9111328125, "reward_std": 0.08529947698116302, "rewards/reward_len/mean": 0.9111328125, "rewards/reward_len/std": 0.2390221208333969, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02336, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8528509140014648, "kl": 0.029244577512145042, "learning_rate": 4.999272369105904e-06, "loss": 0.0001, "num_tokens": 14340208.0, "reward": 0.8619791865348816, "reward_std": 0.1358361542224884, "rewards/reward_len/mean": 0.8619791269302368, "rewards/reward_len/std": 0.3006777763366699, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02368, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7068591117858887, "kl": 0.038017794489860535, "learning_rate": 4.999252017790665e-06, "loss": 0.0001, "num_tokens": 14536384.0, "reward": 0.8388671875, "reward_std": 0.126865953207016, "rewards/reward_len/mean": 0.8388671875, "rewards/reward_len/std": 0.2714577317237854, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.024, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7906740307807922, "kl": 0.035286713391542435, "learning_rate": 4.999231385824008e-06, "loss": 0.0001, "num_tokens": 14732848.0, "reward": 0.8375651240348816, "reward_std": 0.20768919587135315, "rewards/reward_len/mean": 0.8375651240348816, "rewards/reward_len/std": 0.327578604221344, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02432, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7323654890060425, "kl": 0.029418617486953735, "learning_rate": 4.99921047320825e-06, "loss": 0.0001, "num_tokens": 14929392.0, "reward": 0.8792318105697632, "reward_std": 0.109856978058815, "rewards/reward_len/mean": 0.8792318105697632, "rewards/reward_len/std": 0.28465378284454346, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02464, "frac_reward_zero_std": 0.5, "grad_norm": 0.7487607598304749, "kl": 0.02889881655573845, "learning_rate": 4.999189279945741e-06, "loss": 0.0001, "num_tokens": 15125728.0, "reward": 0.8938802480697632, "reward_std": 0.10382238030433655, "rewards/reward_len/mean": 0.8938801884651184, "rewards/reward_len/std": 0.26159700751304626, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02496, "frac_reward_zero_std": 0.5, "grad_norm": 0.7281383275985718, "kl": 0.03049377351999283, "learning_rate": 4.999167806038858e-06, "loss": 0.0001, "num_tokens": 15322032.0, "reward": 0.8343099355697632, "reward_std": 0.14393801987171173, "rewards/reward_len/mean": 0.8343099355697632, "rewards/reward_len/std": 0.3518146574497223, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02528, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6423137784004211, "kl": 0.03136272728443146, "learning_rate": 4.999146051490016e-06, "loss": 0.0001, "num_tokens": 15518320.0, "reward": 0.9407552480697632, "reward_std": 0.0995391458272934, "rewards/reward_len/mean": 0.9407551884651184, "rewards/reward_len/std": 0.24368993937969208, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0256, "frac_reward_zero_std": 0.375, "grad_norm": 0.8017213940620422, "kl": 0.03368952497839928, "learning_rate": 4.999124016301654e-06, "loss": 0.0001, "num_tokens": 15714848.0, "reward": 0.857421875, "reward_std": 0.14494842290878296, "rewards/reward_len/mean": 0.857421875, "rewards/reward_len/std": 0.286716103553772, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02592, "frac_reward_zero_std": 0.25, "grad_norm": 0.8540568947792053, "kl": 0.03586439788341522, "learning_rate": 4.99910170047625e-06, "loss": 0.0001, "num_tokens": 15911296.0, "reward": 0.8414713740348816, "reward_std": 0.18263337016105652, "rewards/reward_len/mean": 0.8414713740348816, "rewards/reward_len/std": 0.3021599352359772, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02624, "frac_reward_zero_std": 0.25, "grad_norm": 0.8202486038208008, "kl": 0.04007921367883682, "learning_rate": 4.999079104016308e-06, "loss": 0.0001, "num_tokens": 16107648.0, "reward": 0.8688151240348816, "reward_std": 0.1747693419456482, "rewards/reward_len/mean": 0.8688150644302368, "rewards/reward_len/std": 0.2946781814098358, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02656, "frac_reward_zero_std": 0.625, "grad_norm": 0.697175920009613, "kl": 0.03438268229365349, "learning_rate": 4.999056226924366e-06, "loss": 0.0001, "num_tokens": 16304064.0, "reward": 0.9052734375, "reward_std": 0.10430242866277695, "rewards/reward_len/mean": 0.9052734375, "rewards/reward_len/std": 0.24809814989566803, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02688, "frac_reward_zero_std": 0.625, "grad_norm": 0.6200621724128723, "kl": 0.030285635963082314, "learning_rate": 4.999033069202992e-06, "loss": 0.0001, "num_tokens": 16500336.0, "reward": 0.9410807490348816, "reward_std": 0.08145523071289062, "rewards/reward_len/mean": 0.9410807490348816, "rewards/reward_len/std": 0.16886034607887268, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0272, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6632947325706482, "kl": 0.03737508878111839, "learning_rate": 4.999009630854787e-06, "loss": 0.0001, "num_tokens": 16696880.0, "reward": 0.8681640625, "reward_std": 0.09984727203845978, "rewards/reward_len/mean": 0.8681640625, "rewards/reward_len/std": 0.3139059841632843, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02752, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5139328837394714, "kl": 0.033338725566864014, "learning_rate": 4.998985911882383e-06, "loss": 0.0001, "num_tokens": 16893424.0, "reward": 0.9231771230697632, "reward_std": 0.064088836312294, "rewards/reward_len/mean": 0.9231771230697632, "rewards/reward_len/std": 0.20548295974731445, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02784, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7163829803466797, "kl": 0.04579077661037445, "learning_rate": 4.998961912288445e-06, "loss": 0.0001, "num_tokens": 17089792.0, "reward": 0.9013671875, "reward_std": 0.14256593585014343, "rewards/reward_len/mean": 0.9013671875, "rewards/reward_len/std": 0.2612590491771698, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02816, "frac_reward_zero_std": 0.625, "grad_norm": 0.6266374588012695, "kl": 0.03986838087439537, "learning_rate": 4.998937632075667e-06, "loss": 0.0001, "num_tokens": 17285984.0, "reward": 0.9016927480697632, "reward_std": 0.08280540257692337, "rewards/reward_len/mean": 0.9016926884651184, "rewards/reward_len/std": 0.24128234386444092, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02848, "frac_reward_zero_std": 0.5, "grad_norm": 0.5966895222663879, "kl": 0.033690571784973145, "learning_rate": 4.998913071246774e-06, "loss": 0.0001, "num_tokens": 17482432.0, "reward": 0.9534505605697632, "reward_std": 0.08552976697683334, "rewards/reward_len/mean": 0.9534505009651184, "rewards/reward_len/std": 0.1531372368335724, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0288, "frac_reward_zero_std": 0.5, "grad_norm": 0.6456487774848938, "kl": 0.06617207825183868, "learning_rate": 4.998888229804526e-06, "loss": 0.0001, "num_tokens": 17678768.0, "reward": 0.8577474355697632, "reward_std": 0.10766549408435822, "rewards/reward_len/mean": 0.8577474355697632, "rewards/reward_len/std": 0.3121115267276764, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02912, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6451205015182495, "kl": 0.03554663062095642, "learning_rate": 4.998863107751711e-06, "loss": 0.0001, "num_tokens": 17875056.0, "reward": 0.8850911855697632, "reward_std": 0.09045256674289703, "rewards/reward_len/mean": 0.8850911855697632, "rewards/reward_len/std": 0.2813292443752289, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02944, "frac_reward_zero_std": 0.5, "grad_norm": 0.6420301795005798, "kl": 0.03609941899776459, "learning_rate": 4.998837705091152e-06, "loss": 0.0001, "num_tokens": 18071504.0, "reward": 0.9020182490348816, "reward_std": 0.12863239645957947, "rewards/reward_len/mean": 0.9020181894302368, "rewards/reward_len/std": 0.2481461763381958, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.02976, "frac_reward_zero_std": 0.4375, "grad_norm": 0.744374692440033, "kl": 0.03676271066069603, "learning_rate": 4.9988120218257e-06, "loss": 0.0001, "num_tokens": 18267712.0, "reward": 0.8391927480697632, "reward_std": 0.13910341262817383, "rewards/reward_len/mean": 0.8391926884651184, "rewards/reward_len/std": 0.33368995785713196, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03008, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6814514398574829, "kl": 0.04908795654773712, "learning_rate": 4.99878605795824e-06, "loss": 0.0001, "num_tokens": 18464272.0, "reward": 0.8059896230697632, "reward_std": 0.10761559009552002, "rewards/reward_len/mean": 0.8059896230697632, "rewards/reward_len/std": 0.3403044641017914, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0304, "frac_reward_zero_std": 0.5, "grad_norm": 0.6609547138214111, "kl": 0.04120853543281555, "learning_rate": 4.998759813491687e-06, "loss": 0.0001, "num_tokens": 18660592.0, "reward": 0.8746744990348816, "reward_std": 0.12070801854133606, "rewards/reward_len/mean": 0.8746744990348816, "rewards/reward_len/std": 0.3313665986061096, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03072, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6958045959472656, "kl": 0.051205702126026154, "learning_rate": 4.998733288428987e-06, "loss": 0.0001, "num_tokens": 18857088.0, "reward": 0.8590494990348816, "reward_std": 0.12743408977985382, "rewards/reward_len/mean": 0.8590494990348816, "rewards/reward_len/std": 0.2551549971103668, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03104, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5829638838768005, "kl": 0.03792741894721985, "learning_rate": 4.998706482773121e-06, "loss": 0.0001, "num_tokens": 19053536.0, "reward": 0.8645833730697632, "reward_std": 0.08903007954359055, "rewards/reward_len/mean": 0.8645833134651184, "rewards/reward_len/std": 0.27937448024749756, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03136, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6745487451553345, "kl": 0.04386503994464874, "learning_rate": 4.998679396527099e-06, "loss": 0.0001, "num_tokens": 19250192.0, "reward": 0.8658854365348816, "reward_std": 0.11358411610126495, "rewards/reward_len/mean": 0.8658853769302368, "rewards/reward_len/std": 0.21724016964435577, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03168, "frac_reward_zero_std": 0.375, "grad_norm": 0.8316110968589783, "kl": 0.04867399483919144, "learning_rate": 4.99865202969396e-06, "loss": 0.0001, "num_tokens": 19446624.0, "reward": 0.8932291865348816, "reward_std": 0.11117450892925262, "rewards/reward_len/mean": 0.8932291865348816, "rewards/reward_len/std": 0.20754754543304443, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.032, "frac_reward_zero_std": 0.125, "grad_norm": 0.9266502857208252, "kl": 0.04411531612277031, "learning_rate": 4.9986243822767795e-06, "loss": 0.0001, "num_tokens": 19643296.0, "reward": 0.8235677480697632, "reward_std": 0.17513221502304077, "rewards/reward_len/mean": 0.8235677480697632, "rewards/reward_len/std": 0.28480416536331177, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03232, "frac_reward_zero_std": 0.625, "grad_norm": 0.5483343601226807, "kl": 0.04683082178235054, "learning_rate": 4.998596454278661e-06, "loss": 0.0001, "num_tokens": 19839744.0, "reward": 0.8821614980697632, "reward_std": 0.17442311346530914, "rewards/reward_len/mean": 0.8821614980697632, "rewards/reward_len/std": 0.4825977385044098, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03264, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8815144896507263, "kl": 0.04954490810632706, "learning_rate": 4.9985682457027415e-06, "loss": 0.0001, "num_tokens": 20036064.0, "reward": 0.7447916865348816, "reward_std": 0.1432168036699295, "rewards/reward_len/mean": 0.7447916865348816, "rewards/reward_len/std": 0.3738817870616913, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03296, "frac_reward_zero_std": 0.5, "grad_norm": 0.6654059290885925, "kl": 0.04038938879966736, "learning_rate": 4.998539756552188e-06, "loss": 0.0001, "num_tokens": 20232656.0, "reward": 0.8264974355697632, "reward_std": 0.09185890108346939, "rewards/reward_len/mean": 0.8264973759651184, "rewards/reward_len/std": 0.3214074671268463, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03328, "frac_reward_zero_std": 0.625, "grad_norm": 0.7029321789741516, "kl": 0.05030888319015503, "learning_rate": 4.998510986830199e-06, "loss": 0.0001, "num_tokens": 20429360.0, "reward": 0.8544921875, "reward_std": 0.07268257439136505, "rewards/reward_len/mean": 0.8544921875, "rewards/reward_len/std": 0.29825639724731445, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0336, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6499698162078857, "kl": 0.042537011206150055, "learning_rate": 4.998481936540007e-06, "loss": 0.0001, "num_tokens": 20625760.0, "reward": 0.921875, "reward_std": 0.10531863570213318, "rewards/reward_len/mean": 0.921875, "rewards/reward_len/std": 0.19435104727745056, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03392, "frac_reward_zero_std": 0.5, "grad_norm": 0.8715834021568298, "kl": 0.04109402373433113, "learning_rate": 4.998452605684874e-06, "loss": 0.0001, "num_tokens": 20822352.0, "reward": 0.8701171875, "reward_std": 0.10361048579216003, "rewards/reward_len/mean": 0.8701171875, "rewards/reward_len/std": 0.2553950250148773, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03424, "frac_reward_zero_std": 0.375, "grad_norm": 0.7118536829948425, "kl": 0.04294419288635254, "learning_rate": 4.9984229942680915e-06, "loss": 0.0001, "num_tokens": 21018672.0, "reward": 0.8587239384651184, "reward_std": 0.1177600771188736, "rewards/reward_len/mean": 0.8587239980697632, "rewards/reward_len/std": 0.2804560959339142, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03456, "frac_reward_zero_std": 0.375, "grad_norm": 0.7399114370346069, "kl": 0.045280821621418, "learning_rate": 4.998393102292986e-06, "loss": 0.0001, "num_tokens": 21215200.0, "reward": 0.8173828125, "reward_std": 0.15532180666923523, "rewards/reward_len/mean": 0.8173828125, "rewards/reward_len/std": 0.2934293746948242, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03488, "frac_reward_zero_std": 0.5, "grad_norm": 0.6489234566688538, "kl": 0.04632344841957092, "learning_rate": 4.998362929762916e-06, "loss": 0.0001, "num_tokens": 21411504.0, "reward": 0.8942057490348816, "reward_std": 0.09123853594064713, "rewards/reward_len/mean": 0.8942057490348816, "rewards/reward_len/std": 0.1944492906332016, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0352, "frac_reward_zero_std": 0.625, "grad_norm": 0.54668790102005, "kl": 0.05354103446006775, "learning_rate": 4.998332476681267e-06, "loss": 0.0001, "num_tokens": 21607696.0, "reward": 0.736328125, "reward_std": 0.10400310903787613, "rewards/reward_len/mean": 0.736328125, "rewards/reward_len/std": 0.3949955999851227, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03552, "frac_reward_zero_std": 0.5, "grad_norm": 0.7160924077033997, "kl": 0.049541719257831573, "learning_rate": 4.998301743051459e-06, "loss": 0.0001, "num_tokens": 21803984.0, "reward": 0.9554036855697632, "reward_std": 0.097760871052742, "rewards/reward_len/mean": 0.9554036259651184, "rewards/reward_len/std": 0.16516345739364624, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03584, "frac_reward_zero_std": 0.6875, "grad_norm": 0.562833309173584, "kl": 0.04444808512926102, "learning_rate": 4.998270728876944e-06, "loss": 0.0001, "num_tokens": 22000272.0, "reward": 0.8658854365348816, "reward_std": 0.035549990832805634, "rewards/reward_len/mean": 0.8658854365348816, "rewards/reward_len/std": 0.29662853479385376, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03616, "frac_reward_zero_std": 0.3125, "grad_norm": 0.9137011766433716, "kl": 0.043288059532642365, "learning_rate": 4.998239434161205e-06, "loss": 0.0001, "num_tokens": 22196688.0, "reward": 0.890625, "reward_std": 0.14863429963588715, "rewards/reward_len/mean": 0.890625, "rewards/reward_len/std": 0.24836063385009766, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03648, "frac_reward_zero_std": 0.75, "grad_norm": 0.44634729623794556, "kl": 0.05052672326564789, "learning_rate": 4.998207858907756e-06, "loss": 0.0001, "num_tokens": 22393200.0, "reward": 0.8876953125, "reward_std": 0.0483868271112442, "rewards/reward_len/mean": 0.8876953125, "rewards/reward_len/std": 0.22397543489933014, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0368, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5135541558265686, "kl": 0.048797838389873505, "learning_rate": 4.9981760031201425e-06, "loss": 0.0001, "num_tokens": 22589600.0, "reward": 0.87109375, "reward_std": 0.04432977735996246, "rewards/reward_len/mean": 0.87109375, "rewards/reward_len/std": 0.33388179540634155, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03712, "frac_reward_zero_std": 0.5, "grad_norm": 0.6542080044746399, "kl": 0.04447535425424576, "learning_rate": 4.998143866801941e-06, "loss": 0.0001, "num_tokens": 22786080.0, "reward": 0.9710286855697632, "reward_std": 0.0813046470284462, "rewards/reward_len/mean": 0.9710286259651184, "rewards/reward_len/std": 0.13284824788570404, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03744, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6921005249023438, "kl": 0.0447402223944664, "learning_rate": 4.998111449956763e-06, "loss": 0.0001, "num_tokens": 22982672.0, "reward": 0.9371744990348816, "reward_std": 0.07981333136558533, "rewards/reward_len/mean": 0.9371744990348816, "rewards/reward_len/std": 0.1854964643716812, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03776, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5102068781852722, "kl": 0.04590785875916481, "learning_rate": 4.9980787525882445e-06, "loss": 0.0001, "num_tokens": 23179008.0, "reward": 0.810546875, "reward_std": 0.08155781030654907, "rewards/reward_len/mean": 0.810546875, "rewards/reward_len/std": 0.35939234495162964, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03808, "frac_reward_zero_std": 0.375, "grad_norm": 0.8284937143325806, "kl": 0.04901726171374321, "learning_rate": 4.99804577470006e-06, "loss": 0.0001, "num_tokens": 23375472.0, "reward": 0.77734375, "reward_std": 0.1501484215259552, "rewards/reward_len/mean": 0.77734375, "rewards/reward_len/std": 0.32247161865234375, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0384, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6987369656562805, "kl": 0.05322997272014618, "learning_rate": 4.998012516295912e-06, "loss": 0.0001, "num_tokens": 23572064.0, "reward": 0.8199869990348816, "reward_std": 0.1454310119152069, "rewards/reward_len/mean": 0.8199869394302368, "rewards/reward_len/std": 0.3523355722427368, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03872, "frac_reward_zero_std": 0.375, "grad_norm": 0.6875567436218262, "kl": 0.06371929496526718, "learning_rate": 4.9979789773795365e-06, "loss": 0.0001, "num_tokens": 23768672.0, "reward": 0.8990885615348816, "reward_std": 0.11863580346107483, "rewards/reward_len/mean": 0.8990885615348816, "rewards/reward_len/std": 0.2285819798707962, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03904, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7178421020507812, "kl": 0.05132008343935013, "learning_rate": 4.997945157954698e-06, "loss": 0.0001, "num_tokens": 23965184.0, "reward": 0.8841146230697632, "reward_std": 0.11480996012687683, "rewards/reward_len/mean": 0.8841146230697632, "rewards/reward_len/std": 0.2824253737926483, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03936, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8206834197044373, "kl": 0.053480520844459534, "learning_rate": 4.997911058025194e-06, "loss": 0.0001, "num_tokens": 24161696.0, "reward": 0.9150390625, "reward_std": 0.13156206905841827, "rewards/reward_len/mean": 0.9150390625, "rewards/reward_len/std": 0.21395376324653625, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.03968, "frac_reward_zero_std": 0.5, "grad_norm": 0.6942713856697083, "kl": 0.047945812344551086, "learning_rate": 4.997876677594856e-06, "loss": 0.0001, "num_tokens": 24358080.0, "reward": 0.814453125, "reward_std": 0.14690063893795013, "rewards/reward_len/mean": 0.814453125, "rewards/reward_len/std": 0.43281957507133484, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04, "frac_reward_zero_std": 0.375, "grad_norm": 0.8228172659873962, "kl": 0.06006351858377457, "learning_rate": 4.997842016667542e-06, "loss": 0.0001, "num_tokens": 24554560.0, "reward": 0.8570963144302368, "reward_std": 0.16265812516212463, "rewards/reward_len/mean": 0.8570963740348816, "rewards/reward_len/std": 0.27227476239204407, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04032, "frac_reward_zero_std": 0.1875, "grad_norm": 0.8444290161132812, "kl": 0.07235468924045563, "learning_rate": 4.997807075247147e-06, "loss": 0.0001, "num_tokens": 24750992.0, "reward": 0.8082682490348816, "reward_std": 0.1908263862133026, "rewards/reward_len/mean": 0.8082681894302368, "rewards/reward_len/std": 0.31802427768707275, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04064, "frac_reward_zero_std": 0.625, "grad_norm": 0.6670640707015991, "kl": 0.05588344857096672, "learning_rate": 4.997771853337592e-06, "loss": 0.0001, "num_tokens": 24947424.0, "reward": 0.888671875, "reward_std": 0.08293865621089935, "rewards/reward_len/mean": 0.888671875, "rewards/reward_len/std": 0.2808866798877716, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04096, "frac_reward_zero_std": 0.5, "grad_norm": 0.8786109685897827, "kl": 0.058647677302360535, "learning_rate": 4.997736350942834e-06, "loss": 0.0001, "num_tokens": 25143776.0, "reward": 0.8258463740348816, "reward_std": 0.13181406259536743, "rewards/reward_len/mean": 0.8258463740348816, "rewards/reward_len/std": 0.34247806668281555, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04128, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8026303648948669, "kl": 0.05989539623260498, "learning_rate": 4.997700568066858e-06, "loss": 0.0001, "num_tokens": 25340320.0, "reward": 0.8766276240348816, "reward_std": 0.18203502893447876, "rewards/reward_len/mean": 0.8766276240348816, "rewards/reward_len/std": 0.27603551745414734, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0416, "frac_reward_zero_std": 0.625, "grad_norm": 0.5540552735328674, "kl": 0.06203462556004524, "learning_rate": 4.997664504713684e-06, "loss": 0.0001, "num_tokens": 25536608.0, "reward": 0.9339193105697632, "reward_std": 0.059743039309978485, "rewards/reward_len/mean": 0.9339193105697632, "rewards/reward_len/std": 0.19678981602191925, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04192, "frac_reward_zero_std": 0.25, "grad_norm": 0.8077885508537292, "kl": 0.06243734434247017, "learning_rate": 4.997628160887361e-06, "loss": 0.0001, "num_tokens": 25733328.0, "reward": 0.8020833730697632, "reward_std": 0.19229719042778015, "rewards/reward_len/mean": 0.8020833730697632, "rewards/reward_len/std": 0.31180477142333984, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04224, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6901979446411133, "kl": 0.0574084147810936, "learning_rate": 4.99759153659197e-06, "loss": 0.0001, "num_tokens": 25929872.0, "reward": 0.8509114980697632, "reward_std": 0.09985854476690292, "rewards/reward_len/mean": 0.8509114980697632, "rewards/reward_len/std": 0.2965216636657715, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04256, "frac_reward_zero_std": 0.625, "grad_norm": 0.613499104976654, "kl": 0.0554431788623333, "learning_rate": 4.997554631831625e-06, "loss": 0.0001, "num_tokens": 26126112.0, "reward": 0.9173177480697632, "reward_std": 0.06885071843862534, "rewards/reward_len/mean": 0.9173177480697632, "rewards/reward_len/std": 0.24543680250644684, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04288, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7758867144584656, "kl": 0.059054940938949585, "learning_rate": 4.9975174466104685e-06, "loss": 0.0001, "num_tokens": 26322432.0, "reward": 0.9397786855697632, "reward_std": 0.10679379105567932, "rewards/reward_len/mean": 0.9397786855697632, "rewards/reward_len/std": 0.17176108062267303, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0432, "frac_reward_zero_std": 0.625, "grad_norm": 0.6877766847610474, "kl": 0.06932512670755386, "learning_rate": 4.997479980932677e-06, "loss": 0.0001, "num_tokens": 26518912.0, "reward": 0.9130859375, "reward_std": 0.07892313599586487, "rewards/reward_len/mean": 0.9130859375, "rewards/reward_len/std": 0.24368536472320557, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04352, "frac_reward_zero_std": 0.5, "grad_norm": 0.69096839427948, "kl": 0.07596761733293533, "learning_rate": 4.9974422348024565e-06, "loss": 0.0002, "num_tokens": 26715184.0, "reward": 0.8831380605697632, "reward_std": 0.08974990248680115, "rewards/reward_len/mean": 0.8831380009651184, "rewards/reward_len/std": 0.2887486517429352, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04384, "frac_reward_zero_std": 0.625, "grad_norm": 0.4813261330127716, "kl": 0.07688115537166595, "learning_rate": 4.997404208224048e-06, "loss": 0.0002, "num_tokens": 26911488.0, "reward": 0.8714193105697632, "reward_std": 0.07773511111736298, "rewards/reward_len/mean": 0.8714192509651184, "rewards/reward_len/std": 0.3241180181503296, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04416, "frac_reward_zero_std": 0.5, "grad_norm": 0.6295547485351562, "kl": 0.0780554711818695, "learning_rate": 4.99736590120172e-06, "loss": 0.0002, "num_tokens": 27107776.0, "reward": 0.9052734375, "reward_std": 0.09226653724908829, "rewards/reward_len/mean": 0.9052734375, "rewards/reward_len/std": 0.2514779269695282, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04448, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7272562384605408, "kl": 0.06095842272043228, "learning_rate": 4.997327313739774e-06, "loss": 0.0001, "num_tokens": 27304128.0, "reward": 0.8610026240348816, "reward_std": 0.10588456690311432, "rewards/reward_len/mean": 0.8610026240348816, "rewards/reward_len/std": 0.26490315794944763, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0448, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7675761580467224, "kl": 0.06698419898748398, "learning_rate": 4.997288445842543e-06, "loss": 0.0001, "num_tokens": 27500432.0, "reward": 0.9436849355697632, "reward_std": 0.10129431635141373, "rewards/reward_len/mean": 0.9436849355697632, "rewards/reward_len/std": 0.16303977370262146, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04512, "frac_reward_zero_std": 0.375, "grad_norm": 0.8226706981658936, "kl": 0.07435575127601624, "learning_rate": 4.9972492975143936e-06, "loss": 0.0001, "num_tokens": 27696752.0, "reward": 0.8492838740348816, "reward_std": 0.18206046521663666, "rewards/reward_len/mean": 0.8492838740348816, "rewards/reward_len/std": 0.3009435534477234, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04544, "frac_reward_zero_std": 0.75, "grad_norm": 0.4597032368183136, "kl": 0.06777079403400421, "learning_rate": 4.99720986875972e-06, "loss": 0.0001, "num_tokens": 27893152.0, "reward": 0.8430989980697632, "reward_std": 0.06990087777376175, "rewards/reward_len/mean": 0.8430989980697632, "rewards/reward_len/std": 0.34923213720321655, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04576, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6893799901008606, "kl": 0.08895497024059296, "learning_rate": 4.99717015958295e-06, "loss": 0.0002, "num_tokens": 28089376.0, "reward": 0.8955078125, "reward_std": 0.11968058347702026, "rewards/reward_len/mean": 0.8955078125, "rewards/reward_len/std": 0.2851869463920593, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04608, "frac_reward_zero_std": 0.6875, "grad_norm": 0.46750542521476746, "kl": 0.0704973116517067, "learning_rate": 4.997130169988544e-06, "loss": 0.0001, "num_tokens": 28285792.0, "reward": 0.880859375, "reward_std": 0.08731576055288315, "rewards/reward_len/mean": 0.880859375, "rewards/reward_len/std": 0.3146190643310547, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0464, "frac_reward_zero_std": 0.625, "grad_norm": 0.5409395694732666, "kl": 0.08054986596107483, "learning_rate": 4.997089899980991e-06, "loss": 0.0002, "num_tokens": 28482320.0, "reward": 0.8756510615348816, "reward_std": 0.06449456512928009, "rewards/reward_len/mean": 0.8756510615348816, "rewards/reward_len/std": 0.24709556996822357, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04672, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7810068726539612, "kl": 0.0692010149359703, "learning_rate": 4.997049349564814e-06, "loss": 0.0001, "num_tokens": 28678864.0, "reward": 0.8430989980697632, "reward_std": 0.1321367472410202, "rewards/reward_len/mean": 0.8430989384651184, "rewards/reward_len/std": 0.31322017312049866, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04704, "frac_reward_zero_std": 0.375, "grad_norm": 0.7886852622032166, "kl": 0.07064078748226166, "learning_rate": 4.997008518744567e-06, "loss": 0.0001, "num_tokens": 28875360.0, "reward": 0.92578125, "reward_std": 0.13618791103363037, "rewards/reward_len/mean": 0.92578125, "rewards/reward_len/std": 0.19223327934741974, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04736, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6672191619873047, "kl": 0.08151240646839142, "learning_rate": 4.996967407524833e-06, "loss": 0.0002, "num_tokens": 29071872.0, "reward": 0.9332682490348816, "reward_std": 0.09509018808603287, "rewards/reward_len/mean": 0.9332681894302368, "rewards/reward_len/std": 0.20524442195892334, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04768, "frac_reward_zero_std": 0.4375, "grad_norm": 0.775506854057312, "kl": 0.091464102268219, "learning_rate": 4.996926015910231e-06, "loss": 0.0002, "num_tokens": 29268400.0, "reward": 0.9453125, "reward_std": 0.11318354308605194, "rewards/reward_len/mean": 0.9453125, "rewards/reward_len/std": 0.17195451259613037, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.048, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5876510143280029, "kl": 0.06707565486431122, "learning_rate": 4.996884343905409e-06, "loss": 0.0001, "num_tokens": 29464848.0, "reward": 0.953125, "reward_std": 0.08225987106561661, "rewards/reward_len/mean": 0.953125, "rewards/reward_len/std": 0.15577132999897003, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04832, "frac_reward_zero_std": 0.6875, "grad_norm": 0.494113564491272, "kl": 0.11548813432455063, "learning_rate": 4.996842391515045e-06, "loss": 0.0002, "num_tokens": 29661232.0, "reward": 0.8388671875, "reward_std": 0.0670529156923294, "rewards/reward_len/mean": 0.8388671875, "rewards/reward_len/std": 0.3051782250404358, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04864, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7089921832084656, "kl": 0.09647858887910843, "learning_rate": 4.996800158743849e-06, "loss": 0.0002, "num_tokens": 29857712.0, "reward": 0.8636068105697632, "reward_std": 0.12918898463249207, "rewards/reward_len/mean": 0.8636067509651184, "rewards/reward_len/std": 0.31359949707984924, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04896, "frac_reward_zero_std": 0.4375, "grad_norm": 1.052402377128601, "kl": 0.0719536617398262, "learning_rate": 4.996757645596566e-06, "loss": 0.0001, "num_tokens": 30054384.0, "reward": 0.8997396230697632, "reward_std": 0.10692217200994492, "rewards/reward_len/mean": 0.8997396230697632, "rewards/reward_len/std": 0.2012649029493332, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04928, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5026658177375793, "kl": 0.07513917982578278, "learning_rate": 4.996714852077969e-06, "loss": 0.0002, "num_tokens": 30250880.0, "reward": 0.8623046875, "reward_std": 0.07318128645420074, "rewards/reward_len/mean": 0.8623046875, "rewards/reward_len/std": 0.27026063203811646, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0496, "frac_reward_zero_std": 0.375, "grad_norm": 0.7499704957008362, "kl": 0.07844732701778412, "learning_rate": 4.996671778192864e-06, "loss": 0.0002, "num_tokens": 30447248.0, "reward": 0.8932291865348816, "reward_std": 0.11607424914836884, "rewards/reward_len/mean": 0.8932291865348816, "rewards/reward_len/std": 0.26174741983413696, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.04992, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6838570237159729, "kl": 0.09508460760116577, "learning_rate": 4.9966284239460875e-06, "loss": 0.0002, "num_tokens": 30643904.0, "reward": 0.8636068105697632, "reward_std": 0.08040210604667664, "rewards/reward_len/mean": 0.8636068105697632, "rewards/reward_len/std": 0.3357435464859009, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05024, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6414079070091248, "kl": 0.07155738770961761, "learning_rate": 4.996584789342507e-06, "loss": 0.0001, "num_tokens": 30840336.0, "reward": 0.8600260615348816, "reward_std": 0.1114499568939209, "rewards/reward_len/mean": 0.8600260615348816, "rewards/reward_len/std": 0.2660805881023407, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05056, "frac_reward_zero_std": 0.375, "grad_norm": 0.8382083773612976, "kl": 0.0742887407541275, "learning_rate": 4.996540874387024e-06, "loss": 0.0001, "num_tokens": 31036592.0, "reward": 0.9036458730697632, "reward_std": 0.1053105965256691, "rewards/reward_len/mean": 0.9036458730697632, "rewards/reward_len/std": 0.2511276304721832, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05088, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5718387365341187, "kl": 0.07484649121761322, "learning_rate": 4.9964966790845685e-06, "loss": 0.0001, "num_tokens": 31232976.0, "reward": 0.98046875, "reward_std": 0.05872027575969696, "rewards/reward_len/mean": 0.98046875, "rewards/reward_len/std": 0.10970567911863327, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0512, "frac_reward_zero_std": 0.5625, "grad_norm": 0.796229362487793, "kl": 0.0879141315817833, "learning_rate": 4.996452203440105e-06, "loss": 0.0002, "num_tokens": 31429024.0, "reward": 0.8606771230697632, "reward_std": 0.0986185222864151, "rewards/reward_len/mean": 0.8606771230697632, "rewards/reward_len/std": 0.3199280798435211, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05152, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6431540846824646, "kl": 0.10569047182798386, "learning_rate": 4.996407447458626e-06, "loss": 0.0002, "num_tokens": 31625408.0, "reward": 0.8317057490348816, "reward_std": 0.1799091100692749, "rewards/reward_len/mean": 0.8317057490348816, "rewards/reward_len/std": 0.34807682037353516, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05184, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5369621515274048, "kl": 0.07548253238201141, "learning_rate": 4.996362411145159e-06, "loss": 0.0002, "num_tokens": 31821760.0, "reward": 0.8541666865348816, "reward_std": 0.0570962056517601, "rewards/reward_len/mean": 0.8541666865348816, "rewards/reward_len/std": 0.2863069772720337, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05216, "frac_reward_zero_std": 0.25, "grad_norm": 0.7474683523178101, "kl": 0.11654243618249893, "learning_rate": 4.99631709450476e-06, "loss": 0.0002, "num_tokens": 32018288.0, "reward": 0.7522786855697632, "reward_std": 0.1333135962486267, "rewards/reward_len/mean": 0.7522786855697632, "rewards/reward_len/std": 0.4416623115539551, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05248, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7331627011299133, "kl": 0.07771243155002594, "learning_rate": 4.996271497542518e-06, "loss": 0.0002, "num_tokens": 32214752.0, "reward": 0.93359375, "reward_std": 0.09760457277297974, "rewards/reward_len/mean": 0.93359375, "rewards/reward_len/std": 0.17891569435596466, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0528, "frac_reward_zero_std": 0.625, "grad_norm": 0.5976435542106628, "kl": 0.09327774494886398, "learning_rate": 4.996225620263554e-06, "loss": 0.0002, "num_tokens": 32411120.0, "reward": 0.9322916865348816, "reward_std": 0.062818244099617, "rewards/reward_len/mean": 0.9322916865348816, "rewards/reward_len/std": 0.16674835979938507, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05312, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6860938668251038, "kl": 0.10673967748880386, "learning_rate": 4.99617946267302e-06, "loss": 0.0002, "num_tokens": 32607824.0, "reward": 0.7692056894302368, "reward_std": 0.12807920575141907, "rewards/reward_len/mean": 0.7692056894302368, "rewards/reward_len/std": 0.3823944926261902, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05344, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6387882232666016, "kl": 0.11012360453605652, "learning_rate": 4.996133024776097e-06, "loss": 0.0002, "num_tokens": 32803904.0, "reward": 0.8720703125, "reward_std": 0.07694263756275177, "rewards/reward_len/mean": 0.8720703125, "rewards/reward_len/std": 0.24586358666419983, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05376, "frac_reward_zero_std": 0.625, "grad_norm": 0.6139965653419495, "kl": 0.082235187292099, "learning_rate": 4.996086306578003e-06, "loss": 0.0002, "num_tokens": 33000256.0, "reward": 0.7880859375, "reward_std": 0.10399128496646881, "rewards/reward_len/mean": 0.7880859375, "rewards/reward_len/std": 0.3888956904411316, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05408, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7274710536003113, "kl": 0.07256945967674255, "learning_rate": 4.996039308083982e-06, "loss": 0.0001, "num_tokens": 33196848.0, "reward": 0.9479166865348816, "reward_std": 0.08178172260522842, "rewards/reward_len/mean": 0.9479166865348816, "rewards/reward_len/std": 0.17022348940372467, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0544, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7237064242362976, "kl": 0.088614322245121, "learning_rate": 4.995992029299312e-06, "loss": 0.0002, "num_tokens": 33393216.0, "reward": 0.8675130605697632, "reward_std": 0.087496817111969, "rewards/reward_len/mean": 0.8675130009651184, "rewards/reward_len/std": 0.3058050274848938, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05472, "frac_reward_zero_std": 0.625, "grad_norm": 0.5772611498832703, "kl": 0.10985636711120605, "learning_rate": 4.9959444702293025e-06, "loss": 0.0002, "num_tokens": 33589552.0, "reward": 0.8987630605697632, "reward_std": 0.07923159748315811, "rewards/reward_len/mean": 0.8987630605697632, "rewards/reward_len/std": 0.26940909028053284, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05504, "frac_reward_zero_std": 0.75, "grad_norm": 0.6074258089065552, "kl": 0.08521751314401627, "learning_rate": 4.995896630879294e-06, "loss": 0.0002, "num_tokens": 33785888.0, "reward": 0.8603515625, "reward_std": 0.04834362119436264, "rewards/reward_len/mean": 0.8603515625, "rewards/reward_len/std": 0.2805461585521698, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05536, "frac_reward_zero_std": 0.5, "grad_norm": 0.9125682711601257, "kl": 0.08923150599002838, "learning_rate": 4.995848511254657e-06, "loss": 0.0002, "num_tokens": 33982432.0, "reward": 0.8512369990348816, "reward_std": 0.1278519332408905, "rewards/reward_len/mean": 0.8512369394302368, "rewards/reward_len/std": 0.3142920732498169, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05568, "frac_reward_zero_std": 0.625, "grad_norm": 0.8972249627113342, "kl": 0.08425841480493546, "learning_rate": 4.995800111360798e-06, "loss": 0.0002, "num_tokens": 34178720.0, "reward": 0.9485677480697632, "reward_std": 0.08281566202640533, "rewards/reward_len/mean": 0.9485676884651184, "rewards/reward_len/std": 0.16881656646728516, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.056, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5237225294113159, "kl": 0.07367609441280365, "learning_rate": 4.99575143120315e-06, "loss": 0.0001, "num_tokens": 34375248.0, "reward": 0.947265625, "reward_std": 0.05978371948003769, "rewards/reward_len/mean": 0.947265625, "rewards/reward_len/std": 0.16265468299388885, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05632, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7218496799468994, "kl": 0.07055965065956116, "learning_rate": 4.99570247078718e-06, "loss": 0.0001, "num_tokens": 34571712.0, "reward": 0.8899739980697632, "reward_std": 0.09408305585384369, "rewards/reward_len/mean": 0.8899739384651184, "rewards/reward_len/std": 0.2582903802394867, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05664, "frac_reward_zero_std": 0.5, "grad_norm": 0.7369672060012817, "kl": 0.08981002867221832, "learning_rate": 4.995653230118387e-06, "loss": 0.0002, "num_tokens": 34768144.0, "reward": 0.8873698115348816, "reward_std": 0.10280534625053406, "rewards/reward_len/mean": 0.8873698115348816, "rewards/reward_len/std": 0.286419153213501, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05696, "frac_reward_zero_std": 0.75, "grad_norm": 0.5606592893600464, "kl": 0.10738368332386017, "learning_rate": 4.995603709202298e-06, "loss": 0.0002, "num_tokens": 34964592.0, "reward": 0.810546875, "reward_std": 0.10605622082948685, "rewards/reward_len/mean": 0.810546875, "rewards/reward_len/std": 0.44258782267570496, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05728, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8137508630752563, "kl": 0.11065888404846191, "learning_rate": 4.995553908044476e-06, "loss": 0.0002, "num_tokens": 35160912.0, "reward": 0.9029948115348816, "reward_std": 0.12246888875961304, "rewards/reward_len/mean": 0.9029947519302368, "rewards/reward_len/std": 0.2586788535118103, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0576, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6109806299209595, "kl": 0.09764401614665985, "learning_rate": 4.995503826650513e-06, "loss": 0.0002, "num_tokens": 35357216.0, "reward": 0.8460286855697632, "reward_std": 0.11547398567199707, "rewards/reward_len/mean": 0.8460286855697632, "rewards/reward_len/std": 0.2945149540901184, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05792, "frac_reward_zero_std": 0.625, "grad_norm": 0.600672721862793, "kl": 0.08681796491146088, "learning_rate": 4.995453465026033e-06, "loss": 0.0002, "num_tokens": 35553792.0, "reward": 0.9088541865348816, "reward_std": 0.06625154614448547, "rewards/reward_len/mean": 0.9088541865348816, "rewards/reward_len/std": 0.2561737895011902, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05824, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7872768044471741, "kl": 0.10905075073242188, "learning_rate": 4.99540282317669e-06, "loss": 0.0002, "num_tokens": 35750384.0, "reward": 0.8665364980697632, "reward_std": 0.12869799137115479, "rewards/reward_len/mean": 0.8665364384651184, "rewards/reward_len/std": 0.2515398859977722, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05856, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8691167831420898, "kl": 0.167855367064476, "learning_rate": 4.995351901108172e-06, "loss": 0.0003, "num_tokens": 35946880.0, "reward": 0.8404948115348816, "reward_std": 0.125442236661911, "rewards/reward_len/mean": 0.8404948115348816, "rewards/reward_len/std": 0.30679085850715637, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05888, "frac_reward_zero_std": 0.5, "grad_norm": 0.7196409106254578, "kl": 0.1253705620765686, "learning_rate": 4.995300698826197e-06, "loss": 0.0003, "num_tokens": 36143328.0, "reward": 0.8225911855697632, "reward_std": 0.14346560835838318, "rewards/reward_len/mean": 0.8225911855697632, "rewards/reward_len/std": 0.311837375164032, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0592, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5562202334403992, "kl": 0.08404836058616638, "learning_rate": 4.995249216336514e-06, "loss": 0.0002, "num_tokens": 36339808.0, "reward": 0.9850260615348816, "reward_std": 0.04655185341835022, "rewards/reward_len/mean": 0.9850260615348816, "rewards/reward_len/std": 0.08474850654602051, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05952, "frac_reward_zero_std": 0.5, "grad_norm": 0.7566428184509277, "kl": 0.09097443521022797, "learning_rate": 4.9951974536449055e-06, "loss": 0.0002, "num_tokens": 36536000.0, "reward": 0.9091796875, "reward_std": 0.08556301891803741, "rewards/reward_len/mean": 0.9091796875, "rewards/reward_len/std": 0.20630934834480286, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.05984, "frac_reward_zero_std": 0.5, "grad_norm": 0.7050685286521912, "kl": 0.10840213298797607, "learning_rate": 4.995145410757183e-06, "loss": 0.0002, "num_tokens": 36732736.0, "reward": 0.833984375, "reward_std": 0.10334545373916626, "rewards/reward_len/mean": 0.833984375, "rewards/reward_len/std": 0.29914477467536926, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06016, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8541884422302246, "kl": 0.09325651079416275, "learning_rate": 4.9950930876791915e-06, "loss": 0.0002, "num_tokens": 36929248.0, "reward": 0.8362630605697632, "reward_std": 0.11429595947265625, "rewards/reward_len/mean": 0.8362630605697632, "rewards/reward_len/std": 0.29320016503334045, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06048, "frac_reward_zero_std": 0.75, "grad_norm": 0.634225606918335, "kl": 0.103565514087677, "learning_rate": 4.995040484416806e-06, "loss": 0.0002, "num_tokens": 37125792.0, "reward": 0.9817708730697632, "reward_std": 0.039173826575279236, "rewards/reward_len/mean": 0.9817708134651184, "rewards/reward_len/std": 0.0857095867395401, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0608, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6658182740211487, "kl": 0.09958794713020325, "learning_rate": 4.994987600975933e-06, "loss": 0.0002, "num_tokens": 37322032.0, "reward": 0.8639323115348816, "reward_std": 0.11056782305240631, "rewards/reward_len/mean": 0.8639322519302368, "rewards/reward_len/std": 0.3093268871307373, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06112, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5221922993659973, "kl": 0.10446944832801819, "learning_rate": 4.994934437362513e-06, "loss": 0.0002, "num_tokens": 37518768.0, "reward": 0.9013671875, "reward_std": 0.1011173278093338, "rewards/reward_len/mean": 0.9013671875, "rewards/reward_len/std": 0.25842928886413574, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06144, "frac_reward_zero_std": 0.6875, "grad_norm": 0.7104811668395996, "kl": 0.10434649139642715, "learning_rate": 4.994880993582514e-06, "loss": 0.0002, "num_tokens": 37715424.0, "reward": 0.9033203125, "reward_std": 0.07863481342792511, "rewards/reward_len/mean": 0.9033203125, "rewards/reward_len/std": 0.2374539077281952, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06176, "frac_reward_zero_std": 0.5, "grad_norm": 0.6666585803031921, "kl": 0.10489232838153839, "learning_rate": 4.994827269641939e-06, "loss": 0.0002, "num_tokens": 37911728.0, "reward": 0.9137369990348816, "reward_std": 0.07843010127544403, "rewards/reward_len/mean": 0.9137369990348816, "rewards/reward_len/std": 0.23643386363983154, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06208, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4212785065174103, "kl": 0.0997745543718338, "learning_rate": 4.994773265546818e-06, "loss": 0.0002, "num_tokens": 38108032.0, "reward": 0.876953125, "reward_std": 0.06341002136468887, "rewards/reward_len/mean": 0.876953125, "rewards/reward_len/std": 0.2841040790081024, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0624, "frac_reward_zero_std": 0.625, "grad_norm": 0.574761152267456, "kl": 0.08906789124011993, "learning_rate": 4.994718981303219e-06, "loss": 0.0002, "num_tokens": 38304560.0, "reward": 0.9212239980697632, "reward_std": 0.0782536193728447, "rewards/reward_len/mean": 0.9212239980697632, "rewards/reward_len/std": 0.2245325893163681, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06272, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5374192595481873, "kl": 0.16803714632987976, "learning_rate": 4.994664416917236e-06, "loss": 0.0003, "num_tokens": 38500848.0, "reward": 0.935546875, "reward_std": 0.10581074655056, "rewards/reward_len/mean": 0.935546875, "rewards/reward_len/std": 0.21074482798576355, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06304, "frac_reward_zero_std": 0.3125, "grad_norm": 0.8086875081062317, "kl": 0.1142357736825943, "learning_rate": 4.994609572394996e-06, "loss": 0.0002, "num_tokens": 38697392.0, "reward": 0.7705078125, "reward_std": 0.12558278441429138, "rewards/reward_len/mean": 0.7705078125, "rewards/reward_len/std": 0.3446405827999115, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06336, "frac_reward_zero_std": 0.6875, "grad_norm": 0.572655439376831, "kl": 0.09784573316574097, "learning_rate": 4.994554447742658e-06, "loss": 0.0002, "num_tokens": 38893952.0, "reward": 0.9313151240348816, "reward_std": 0.07897623628377914, "rewards/reward_len/mean": 0.9313151240348816, "rewards/reward_len/std": 0.19295108318328857, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06368, "frac_reward_zero_std": 0.625, "grad_norm": 0.5849019289016724, "kl": 0.15582725405693054, "learning_rate": 4.994499042966413e-06, "loss": 0.0003, "num_tokens": 39090032.0, "reward": 0.970703125, "reward_std": 0.05921054631471634, "rewards/reward_len/mean": 0.970703125, "rewards/reward_len/std": 0.11972638219594955, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.064, "frac_reward_zero_std": 0.625, "grad_norm": 0.5770314931869507, "kl": 0.11267239600419998, "learning_rate": 4.994443358072481e-06, "loss": 0.0002, "num_tokens": 39286400.0, "reward": 0.9407552480697632, "reward_std": 0.07146152853965759, "rewards/reward_len/mean": 0.9407552480697632, "rewards/reward_len/std": 0.18006666004657745, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06432, "frac_reward_zero_std": 0.5, "grad_norm": 0.7825047373771667, "kl": 0.10432711988687515, "learning_rate": 4.9943873930671175e-06, "loss": 0.0002, "num_tokens": 39482992.0, "reward": 0.9654948115348816, "reward_std": 0.06037842482328415, "rewards/reward_len/mean": 0.9654948115348816, "rewards/reward_len/std": 0.10307227820158005, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06464, "frac_reward_zero_std": 0.5, "grad_norm": 0.6958819031715393, "kl": 0.10829421877861023, "learning_rate": 4.994331147956604e-06, "loss": 0.0002, "num_tokens": 39679392.0, "reward": 0.9052734375, "reward_std": 0.1363808512687683, "rewards/reward_len/mean": 0.9052734375, "rewards/reward_len/std": 0.2566233277320862, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06496, "frac_reward_zero_std": 0.75, "grad_norm": 0.40797531604766846, "kl": 0.11702263355255127, "learning_rate": 4.994274622747259e-06, "loss": 0.0002, "num_tokens": 39875872.0, "reward": 0.9850260615348816, "reward_std": 0.0357811376452446, "rewards/reward_len/mean": 0.9850260615348816, "rewards/reward_len/std": 0.08180533349514008, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06528, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6733713746070862, "kl": 0.10767699778079987, "learning_rate": 4.9942178174454285e-06, "loss": 0.0002, "num_tokens": 40072176.0, "reward": 0.8287760615348816, "reward_std": 0.12422393262386322, "rewards/reward_len/mean": 0.8287760019302368, "rewards/reward_len/std": 0.3528299331665039, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0656, "frac_reward_zero_std": 0.625, "grad_norm": 0.6387746930122375, "kl": 0.09816346317529678, "learning_rate": 4.994160732057491e-06, "loss": 0.0002, "num_tokens": 40268768.0, "reward": 0.8727213740348816, "reward_std": 0.06526752561330795, "rewards/reward_len/mean": 0.8727213740348816, "rewards/reward_len/std": 0.24642369151115417, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06592, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4557179808616638, "kl": 0.10721514374017715, "learning_rate": 4.994103366589859e-06, "loss": 0.0002, "num_tokens": 40465296.0, "reward": 0.974609375, "reward_std": 0.03420029580593109, "rewards/reward_len/mean": 0.974609375, "rewards/reward_len/std": 0.08890436589717865, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06624, "frac_reward_zero_std": 0.625, "grad_norm": 0.7107056975364685, "kl": 0.09219434857368469, "learning_rate": 4.994045721048973e-06, "loss": 0.0002, "num_tokens": 40661936.0, "reward": 0.94921875, "reward_std": 0.05523230880498886, "rewards/reward_len/mean": 0.94921875, "rewards/reward_len/std": 0.16723249852657318, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06656, "frac_reward_zero_std": 0.625, "grad_norm": 0.5691456198692322, "kl": 0.10697256028652191, "learning_rate": 4.9939877954413065e-06, "loss": 0.0002, "num_tokens": 40858128.0, "reward": 0.939453125, "reward_std": 0.08196184039115906, "rewards/reward_len/mean": 0.939453125, "rewards/reward_len/std": 0.18530280888080597, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06688, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5829526782035828, "kl": 0.10628324747085571, "learning_rate": 4.993929589773363e-06, "loss": 0.0002, "num_tokens": 41054320.0, "reward": 0.8675130605697632, "reward_std": 0.049422577023506165, "rewards/reward_len/mean": 0.8675130605697632, "rewards/reward_len/std": 0.2868739664554596, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0672, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5931270122528076, "kl": 0.11139126121997833, "learning_rate": 4.993871104051681e-06, "loss": 0.0002, "num_tokens": 41250704.0, "reward": 0.8785807490348816, "reward_std": 0.06874778866767883, "rewards/reward_len/mean": 0.8785807490348816, "rewards/reward_len/std": 0.2760170102119446, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06752, "frac_reward_zero_std": 0.5, "grad_norm": 0.7122511863708496, "kl": 0.09654668718576431, "learning_rate": 4.993812338282826e-06, "loss": 0.0002, "num_tokens": 41447072.0, "reward": 0.9404296875, "reward_std": 0.09347277879714966, "rewards/reward_len/mean": 0.9404296875, "rewards/reward_len/std": 0.20118004083633423, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06784, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6578711271286011, "kl": 0.10473594069480896, "learning_rate": 4.993753292473398e-06, "loss": 0.0002, "num_tokens": 41643472.0, "reward": 0.8834635615348816, "reward_std": 0.0690227597951889, "rewards/reward_len/mean": 0.8834635615348816, "rewards/reward_len/std": 0.27412763237953186, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06816, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5426110625267029, "kl": 0.19027242064476013, "learning_rate": 4.993693966630028e-06, "loss": 0.0004, "num_tokens": 41840016.0, "reward": 0.8824869394302368, "reward_std": 0.08826197683811188, "rewards/reward_len/mean": 0.8824869394302368, "rewards/reward_len/std": 0.2589326500892639, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06848, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5578992962837219, "kl": 0.11241623759269714, "learning_rate": 4.993634360759376e-06, "loss": 0.0002, "num_tokens": 42036432.0, "reward": 0.912109375, "reward_std": 0.07538636028766632, "rewards/reward_len/mean": 0.912109375, "rewards/reward_len/std": 0.25942131876945496, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0688, "frac_reward_zero_std": 0.5, "grad_norm": 0.6526371836662292, "kl": 0.10802334547042847, "learning_rate": 4.993574474868138e-06, "loss": 0.0002, "num_tokens": 42233024.0, "reward": 0.9518229365348816, "reward_std": 0.07811720669269562, "rewards/reward_len/mean": 0.9518228769302368, "rewards/reward_len/std": 0.14180956780910492, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06912, "frac_reward_zero_std": 0.625, "grad_norm": 0.5860698819160461, "kl": 0.1582513451576233, "learning_rate": 4.993514308963037e-06, "loss": 0.0003, "num_tokens": 42429392.0, "reward": 0.9248046875, "reward_std": 0.07701393961906433, "rewards/reward_len/mean": 0.9248046875, "rewards/reward_len/std": 0.18848660588264465, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06944, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6786426901817322, "kl": 0.10337725281715393, "learning_rate": 4.9934538630508285e-06, "loss": 0.0002, "num_tokens": 42625664.0, "reward": 0.9778646230697632, "reward_std": 0.07286320626735687, "rewards/reward_len/mean": 0.9778645634651184, "rewards/reward_len/std": 0.11853516846895218, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.06976, "frac_reward_zero_std": 0.375, "grad_norm": 0.812323272228241, "kl": 0.132625013589859, "learning_rate": 4.993393137138304e-06, "loss": 0.0003, "num_tokens": 42822384.0, "reward": 0.9127604365348816, "reward_std": 0.145424947142601, "rewards/reward_len/mean": 0.9127604365348816, "rewards/reward_len/std": 0.21827194094657898, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07008, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3683765232563019, "kl": 0.10570649802684784, "learning_rate": 4.993332131232278e-06, "loss": 0.0002, "num_tokens": 43018848.0, "reward": 0.9915364980697632, "reward_std": 0.0338541679084301, "rewards/reward_len/mean": 0.9915364980697632, "rewards/reward_len/std": 0.08107384294271469, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0704, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6045786738395691, "kl": 0.10861130058765411, "learning_rate": 4.993270845339605e-06, "loss": 0.0002, "num_tokens": 43215248.0, "reward": 0.8395182490348816, "reward_std": 0.12501531839370728, "rewards/reward_len/mean": 0.8395182490348816, "rewards/reward_len/std": 0.37535715103149414, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07072, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6509298086166382, "kl": 0.14409491419792175, "learning_rate": 4.993209279467164e-06, "loss": 0.0003, "num_tokens": 43411488.0, "reward": 0.8678385615348816, "reward_std": 0.05664858967065811, "rewards/reward_len/mean": 0.8678385615348816, "rewards/reward_len/std": 0.3113725483417511, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07104, "frac_reward_zero_std": 0.4375, "grad_norm": 0.766272783279419, "kl": 0.12992094457149506, "learning_rate": 4.993147433621871e-06, "loss": 0.0003, "num_tokens": 43608000.0, "reward": 0.8860677480697632, "reward_std": 0.1661549210548401, "rewards/reward_len/mean": 0.8860676884651184, "rewards/reward_len/std": 0.299485981464386, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07136, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7780870795249939, "kl": 0.1114724725484848, "learning_rate": 4.993085307810669e-06, "loss": 0.0002, "num_tokens": 43804672.0, "reward": 0.8756510019302368, "reward_std": 0.14009106159210205, "rewards/reward_len/mean": 0.8756510615348816, "rewards/reward_len/std": 0.2563987374305725, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07168, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5877574682235718, "kl": 0.11648842692375183, "learning_rate": 4.993022902040535e-06, "loss": 0.0002, "num_tokens": 44000832.0, "reward": 0.9088541865348816, "reward_std": 0.07611465454101562, "rewards/reward_len/mean": 0.9088541865348816, "rewards/reward_len/std": 0.21413491666316986, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.072, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6388769149780273, "kl": 0.11433035880327225, "learning_rate": 4.992960216318478e-06, "loss": 0.0002, "num_tokens": 44197408.0, "reward": 0.8590494990348816, "reward_std": 0.10520598292350769, "rewards/reward_len/mean": 0.8590494990348816, "rewards/reward_len/std": 0.25311896204948425, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07232, "frac_reward_zero_std": 0.5, "grad_norm": 0.8778823018074036, "kl": 0.12935525178909302, "learning_rate": 4.992897250651535e-06, "loss": 0.0003, "num_tokens": 44393952.0, "reward": 0.9541015625, "reward_std": 0.07013829797506332, "rewards/reward_len/mean": 0.9541015625, "rewards/reward_len/std": 0.1313309520483017, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07264, "frac_reward_zero_std": 0.5, "grad_norm": 0.6829720139503479, "kl": 0.1428414285182953, "learning_rate": 4.9928340050467785e-06, "loss": 0.0003, "num_tokens": 44590448.0, "reward": 0.7809244990348816, "reward_std": 0.14423221349716187, "rewards/reward_len/mean": 0.7809244394302368, "rewards/reward_len/std": 0.3772444427013397, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07296, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7422612905502319, "kl": 0.11076122522354126, "learning_rate": 4.99277047951131e-06, "loss": 0.0002, "num_tokens": 44786752.0, "reward": 0.8544921875, "reward_std": 0.0927259624004364, "rewards/reward_len/mean": 0.8544921875, "rewards/reward_len/std": 0.2795935571193695, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07328, "frac_reward_zero_std": 0.375, "grad_norm": 0.6979014873504639, "kl": 0.12275241315364838, "learning_rate": 4.992706674052263e-06, "loss": 0.0002, "num_tokens": 44983376.0, "reward": 0.919921875, "reward_std": 0.08856205642223358, "rewards/reward_len/mean": 0.919921875, "rewards/reward_len/std": 0.17647676169872284, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0736, "frac_reward_zero_std": 0.25, "grad_norm": 0.8511138558387756, "kl": 0.13473902642726898, "learning_rate": 4.992642588676802e-06, "loss": 0.0003, "num_tokens": 45179920.0, "reward": 0.873046875, "reward_std": 0.1388612985610962, "rewards/reward_len/mean": 0.873046875, "rewards/reward_len/std": 0.2508080303668976, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07392, "frac_reward_zero_std": 0.625, "grad_norm": 0.6099172830581665, "kl": 0.1405186653137207, "learning_rate": 4.992578223392124e-06, "loss": 0.0003, "num_tokens": 45376352.0, "reward": 0.9430338740348816, "reward_std": 0.07320138067007065, "rewards/reward_len/mean": 0.9430338740348816, "rewards/reward_len/std": 0.16079282760620117, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07424, "frac_reward_zero_std": 0.375, "grad_norm": 0.9191357493400574, "kl": 0.12696006894111633, "learning_rate": 4.992513578205457e-06, "loss": 0.0003, "num_tokens": 45572784.0, "reward": 0.9091796875, "reward_std": 0.13014641404151917, "rewards/reward_len/mean": 0.9091796875, "rewards/reward_len/std": 0.23203030228614807, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07456, "frac_reward_zero_std": 0.625, "grad_norm": 0.5489064455032349, "kl": 0.12983179092407227, "learning_rate": 4.99244865312406e-06, "loss": 0.0003, "num_tokens": 45769344.0, "reward": 0.9309896230697632, "reward_std": 0.07933264970779419, "rewards/reward_len/mean": 0.9309896230697632, "rewards/reward_len/std": 0.2320011854171753, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07488, "frac_reward_zero_std": 0.5, "grad_norm": 0.9211183786392212, "kl": 0.12873908877372742, "learning_rate": 4.992383448155224e-06, "loss": 0.0003, "num_tokens": 45965568.0, "reward": 0.9326171875, "reward_std": 0.06374112516641617, "rewards/reward_len/mean": 0.9326171875, "rewards/reward_len/std": 0.20988774299621582, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0752, "frac_reward_zero_std": 0.375, "grad_norm": 0.8573291897773743, "kl": 0.13603922724723816, "learning_rate": 4.992317963306271e-06, "loss": 0.0003, "num_tokens": 46161984.0, "reward": 0.8518880605697632, "reward_std": 0.10121491551399231, "rewards/reward_len/mean": 0.8518880009651184, "rewards/reward_len/std": 0.2907576560974121, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07552, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5990524291992188, "kl": 0.117702916264534, "learning_rate": 4.992252198584554e-06, "loss": 0.0002, "num_tokens": 46358592.0, "reward": 0.9384765625, "reward_std": 0.10570720583200455, "rewards/reward_len/mean": 0.9384765625, "rewards/reward_len/std": 0.19909003376960754, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07584, "frac_reward_zero_std": 0.625, "grad_norm": 0.6050568222999573, "kl": 0.12044993788003922, "learning_rate": 4.992186153997458e-06, "loss": 0.0002, "num_tokens": 46554976.0, "reward": 0.9290364980697632, "reward_std": 0.07565077394247055, "rewards/reward_len/mean": 0.9290364980697632, "rewards/reward_len/std": 0.19233398139476776, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07616, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4187593162059784, "kl": 0.1489473283290863, "learning_rate": 4.992119829552401e-06, "loss": 0.0003, "num_tokens": 46751664.0, "reward": 0.9274088740348816, "reward_std": 0.05451836809515953, "rewards/reward_len/mean": 0.9274088740348816, "rewards/reward_len/std": 0.19334982335567474, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07648, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7127272486686707, "kl": 0.12710312008857727, "learning_rate": 4.992053225256829e-06, "loss": 0.0003, "num_tokens": 46948336.0, "reward": 0.9449869990348816, "reward_std": 0.10750681161880493, "rewards/reward_len/mean": 0.9449869990348816, "rewards/reward_len/std": 0.16497786343097687, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0768, "frac_reward_zero_std": 0.5, "grad_norm": 0.6521912813186646, "kl": 0.1454586535692215, "learning_rate": 4.991986341118221e-06, "loss": 0.0003, "num_tokens": 47144960.0, "reward": 0.9065755605697632, "reward_std": 0.11040978133678436, "rewards/reward_len/mean": 0.9065755009651184, "rewards/reward_len/std": 0.2635875344276428, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07712, "frac_reward_zero_std": 0.6875, "grad_norm": 0.605106770992279, "kl": 0.12508417665958405, "learning_rate": 4.9919191771440905e-06, "loss": 0.0003, "num_tokens": 47341520.0, "reward": 0.8958333730697632, "reward_std": 0.0782233476638794, "rewards/reward_len/mean": 0.8958333730697632, "rewards/reward_len/std": 0.24632373452186584, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07744, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6580401659011841, "kl": 0.11791373044252396, "learning_rate": 4.991851733341977e-06, "loss": 0.0002, "num_tokens": 47537872.0, "reward": 0.880859375, "reward_std": 0.06998249888420105, "rewards/reward_len/mean": 0.880859375, "rewards/reward_len/std": 0.2494402527809143, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07776, "frac_reward_zero_std": 0.5, "grad_norm": 1.1426188945770264, "kl": 0.14084668457508087, "learning_rate": 4.991784009719455e-06, "loss": 0.0003, "num_tokens": 47734384.0, "reward": 0.8759765625, "reward_std": 0.10268674790859222, "rewards/reward_len/mean": 0.8759765625, "rewards/reward_len/std": 0.2955721318721771, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07808, "frac_reward_zero_std": 0.5, "grad_norm": 0.7229583859443665, "kl": 0.1390959769487381, "learning_rate": 4.991716006284129e-06, "loss": 0.0003, "num_tokens": 47930736.0, "reward": 0.8746744990348816, "reward_std": 0.07177183032035828, "rewards/reward_len/mean": 0.8746744990348816, "rewards/reward_len/std": 0.2818973660469055, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0784, "frac_reward_zero_std": 0.625, "grad_norm": 0.7337759137153625, "kl": 0.12047222256660461, "learning_rate": 4.991647723043636e-06, "loss": 0.0002, "num_tokens": 48127152.0, "reward": 0.9130859375, "reward_std": 0.07388008385896683, "rewards/reward_len/mean": 0.9130859375, "rewards/reward_len/std": 0.21696354448795319, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07872, "frac_reward_zero_std": 0.625, "grad_norm": 0.5843394994735718, "kl": 0.13104906678199768, "learning_rate": 4.991579160005644e-06, "loss": 0.0003, "num_tokens": 48323728.0, "reward": 0.9544271230697632, "reward_std": 0.06581531465053558, "rewards/reward_len/mean": 0.9544271230697632, "rewards/reward_len/std": 0.1376187801361084, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07904, "frac_reward_zero_std": 0.5, "grad_norm": 0.7412914037704468, "kl": 0.137121319770813, "learning_rate": 4.991510317177851e-06, "loss": 0.0003, "num_tokens": 48520160.0, "reward": 0.8844401240348816, "reward_std": 0.11447978019714355, "rewards/reward_len/mean": 0.8844400644302368, "rewards/reward_len/std": 0.2893683910369873, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07936, "frac_reward_zero_std": 0.5, "grad_norm": 0.6369826197624207, "kl": 0.12548300623893738, "learning_rate": 4.9914411945679884e-06, "loss": 0.0003, "num_tokens": 48716400.0, "reward": 0.9791666865348816, "reward_std": 0.06467807292938232, "rewards/reward_len/mean": 0.9791666269302368, "rewards/reward_len/std": 0.0956573411822319, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.07968, "frac_reward_zero_std": 0.5, "grad_norm": 0.7735297679901123, "kl": 0.1304980218410492, "learning_rate": 4.991371792183818e-06, "loss": 0.0003, "num_tokens": 48912928.0, "reward": 0.9202474355697632, "reward_std": 0.07831915467977524, "rewards/reward_len/mean": 0.9202473759651184, "rewards/reward_len/std": 0.18081510066986084, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08, "frac_reward_zero_std": 0.6875, "grad_norm": 0.49821528792381287, "kl": 0.13353890180587769, "learning_rate": 4.9913021100331344e-06, "loss": 0.0003, "num_tokens": 49109440.0, "reward": 0.9720052480697632, "reward_std": 0.037427812814712524, "rewards/reward_len/mean": 0.9720052480697632, "rewards/reward_len/std": 0.103730708360672, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08032, "frac_reward_zero_std": 0.5, "grad_norm": 0.8171219825744629, "kl": 0.14515572786331177, "learning_rate": 4.9912321481237616e-06, "loss": 0.0003, "num_tokens": 49305904.0, "reward": 0.9166666865348816, "reward_std": 0.0894092321395874, "rewards/reward_len/mean": 0.9166666865348816, "rewards/reward_len/std": 0.2331465184688568, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08064, "frac_reward_zero_std": 0.625, "grad_norm": 0.8101213574409485, "kl": 0.1421227753162384, "learning_rate": 4.991161906463555e-06, "loss": 0.0003, "num_tokens": 49502176.0, "reward": 0.9248046875, "reward_std": 0.09208990633487701, "rewards/reward_len/mean": 0.9248046875, "rewards/reward_len/std": 0.21494784951210022, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08096, "frac_reward_zero_std": 0.375, "grad_norm": 0.761877179145813, "kl": 0.14815253019332886, "learning_rate": 4.9910913850604035e-06, "loss": 0.0003, "num_tokens": 49698416.0, "reward": 0.8665364980697632, "reward_std": 0.15526506304740906, "rewards/reward_len/mean": 0.8665364980697632, "rewards/reward_len/std": 0.2492559403181076, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08128, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5470792651176453, "kl": 0.14317984879016876, "learning_rate": 4.991020583922227e-06, "loss": 0.0003, "num_tokens": 49894928.0, "reward": 0.8434244990348816, "reward_std": 0.09754301607608795, "rewards/reward_len/mean": 0.8434244394302368, "rewards/reward_len/std": 0.36786961555480957, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0816, "frac_reward_zero_std": 0.625, "grad_norm": 0.6213687062263489, "kl": 0.1466769576072693, "learning_rate": 4.9909495030569744e-06, "loss": 0.0003, "num_tokens": 50091232.0, "reward": 0.8844401240348816, "reward_std": 0.11906179785728455, "rewards/reward_len/mean": 0.8844400644302368, "rewards/reward_len/std": 0.28852012753486633, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08192, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6403785347938538, "kl": 0.12964850664138794, "learning_rate": 4.990878142472628e-06, "loss": 0.0003, "num_tokens": 50287584.0, "reward": 0.9778646230697632, "reward_std": 0.06314116716384888, "rewards/reward_len/mean": 0.9778645634651184, "rewards/reward_len/std": 0.11853517591953278, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08224, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8144481778144836, "kl": 0.14283603429794312, "learning_rate": 4.9908065021772025e-06, "loss": 0.0003, "num_tokens": 50484128.0, "reward": 0.966796875, "reward_std": 0.09816902130842209, "rewards/reward_len/mean": 0.966796875, "rewards/reward_len/std": 0.136617973446846, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08256, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5882145166397095, "kl": 0.14350882172584534, "learning_rate": 4.9907345821787415e-06, "loss": 0.0003, "num_tokens": 50680608.0, "reward": 0.9319661855697632, "reward_std": 0.09338431805372238, "rewards/reward_len/mean": 0.9319661855697632, "rewards/reward_len/std": 0.18774919211864471, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08288, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4698895812034607, "kl": 0.12436877936124802, "learning_rate": 4.990662382485321e-06, "loss": 0.0002, "num_tokens": 50877152.0, "reward": 0.8990885615348816, "reward_std": 0.080576092004776, "rewards/reward_len/mean": 0.8990885615348816, "rewards/reward_len/std": 0.2703891098499298, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0832, "frac_reward_zero_std": 0.625, "grad_norm": 0.494366854429245, "kl": 0.15376798808574677, "learning_rate": 4.990589903105049e-06, "loss": 0.0003, "num_tokens": 51073376.0, "reward": 0.9225260615348816, "reward_std": 0.05913439020514488, "rewards/reward_len/mean": 0.9225260019302368, "rewards/reward_len/std": 0.2430255115032196, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08352, "frac_reward_zero_std": 0.625, "grad_norm": 0.5663818120956421, "kl": 0.14309410750865936, "learning_rate": 4.9905171440460645e-06, "loss": 0.0003, "num_tokens": 51270000.0, "reward": 0.9794921875, "reward_std": 0.06855131685733795, "rewards/reward_len/mean": 0.9794921875, "rewards/reward_len/std": 0.11355515569448471, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08384, "frac_reward_zero_std": 0.3125, "grad_norm": 0.7620944380760193, "kl": 0.20061814785003662, "learning_rate": 4.990444105316538e-06, "loss": 0.0004, "num_tokens": 51466656.0, "reward": 0.8203125, "reward_std": 0.1335533857345581, "rewards/reward_len/mean": 0.8203125, "rewards/reward_len/std": 0.3628196120262146, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08416, "frac_reward_zero_std": 0.5625, "grad_norm": 0.62797611951828, "kl": 0.13191480934619904, "learning_rate": 4.9903707869246705e-06, "loss": 0.0003, "num_tokens": 51663376.0, "reward": 0.9384765625, "reward_std": 0.08720585703849792, "rewards/reward_len/mean": 0.9384765625, "rewards/reward_len/std": 0.16181373596191406, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08448, "frac_reward_zero_std": 0.5625, "grad_norm": 0.783308744430542, "kl": 0.1400223970413208, "learning_rate": 4.9902971888786974e-06, "loss": 0.0003, "num_tokens": 51859936.0, "reward": 0.8792318105697632, "reward_std": 0.09938423335552216, "rewards/reward_len/mean": 0.8792318105697632, "rewards/reward_len/std": 0.2708284258842468, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0848, "frac_reward_zero_std": 0.25, "grad_norm": 0.7975671887397766, "kl": 0.1555856466293335, "learning_rate": 4.990223311186881e-06, "loss": 0.0003, "num_tokens": 52056624.0, "reward": 0.8297526240348816, "reward_std": 0.16079317033290863, "rewards/reward_len/mean": 0.8297525644302368, "rewards/reward_len/std": 0.29770946502685547, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08512, "frac_reward_zero_std": 0.5, "grad_norm": 0.6111416220664978, "kl": 0.1495470404624939, "learning_rate": 4.990149153857519e-06, "loss": 0.0003, "num_tokens": 52253120.0, "reward": 0.8909505605697632, "reward_std": 0.09638872742652893, "rewards/reward_len/mean": 0.8909505605697632, "rewards/reward_len/std": 0.25536835193634033, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08544, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7845742106437683, "kl": 0.15708449482917786, "learning_rate": 4.9900747168989375e-06, "loss": 0.0003, "num_tokens": 52449328.0, "reward": 0.8701171875, "reward_std": 0.103302001953125, "rewards/reward_len/mean": 0.8701171875, "rewards/reward_len/std": 0.30442994832992554, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08576, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7046849727630615, "kl": 0.18841896951198578, "learning_rate": 4.990000000319496e-06, "loss": 0.0004, "num_tokens": 52645872.0, "reward": 0.9375, "reward_std": 0.13257798552513123, "rewards/reward_len/mean": 0.9375, "rewards/reward_len/std": 0.22164960205554962, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08608, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6093775033950806, "kl": 0.14093530178070068, "learning_rate": 4.989925004127584e-06, "loss": 0.0003, "num_tokens": 52842400.0, "reward": 0.9720052480697632, "reward_std": 0.04746638983488083, "rewards/reward_len/mean": 0.9720051884651184, "rewards/reward_len/std": 0.10134033858776093, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0864, "frac_reward_zero_std": 0.5, "grad_norm": 0.7358813285827637, "kl": 0.14359839260578156, "learning_rate": 4.989849728331625e-06, "loss": 0.0003, "num_tokens": 53038816.0, "reward": 0.9020182490348816, "reward_std": 0.1082427054643631, "rewards/reward_len/mean": 0.9020182490348816, "rewards/reward_len/std": 0.25054895877838135, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08672, "frac_reward_zero_std": 0.8125, "grad_norm": 0.38457974791526794, "kl": 0.1408851444721222, "learning_rate": 4.989774172940071e-06, "loss": 0.0003, "num_tokens": 53235376.0, "reward": 0.9505208730697632, "reward_std": 0.0312499962747097, "rewards/reward_len/mean": 0.9505208134651184, "rewards/reward_len/std": 0.17509335279464722, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08704, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6539224982261658, "kl": 0.18015702068805695, "learning_rate": 4.989698337961405e-06, "loss": 0.0004, "num_tokens": 53431840.0, "reward": 0.904296875, "reward_std": 0.09653319418430328, "rewards/reward_len/mean": 0.904296875, "rewards/reward_len/std": 0.23572301864624023, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08736, "frac_reward_zero_std": 0.375, "grad_norm": 0.8908942937850952, "kl": 0.17987269163131714, "learning_rate": 4.989622223404144e-06, "loss": 0.0004, "num_tokens": 53628416.0, "reward": 0.8600260615348816, "reward_std": 0.1092572882771492, "rewards/reward_len/mean": 0.8600260019302368, "rewards/reward_len/std": 0.2693358063697815, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08768, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6653320789337158, "kl": 0.13816805183887482, "learning_rate": 4.989545829276836e-06, "loss": 0.0003, "num_tokens": 53824800.0, "reward": 0.9612630605697632, "reward_std": 0.07954277098178864, "rewards/reward_len/mean": 0.9612630605697632, "rewards/reward_len/std": 0.14849311113357544, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.088, "frac_reward_zero_std": 0.6875, "grad_norm": 0.7500613331794739, "kl": 0.16194546222686768, "learning_rate": 4.989469155588059e-06, "loss": 0.0003, "num_tokens": 54021232.0, "reward": 0.9466146230697632, "reward_std": 0.07759144902229309, "rewards/reward_len/mean": 0.9466145634651184, "rewards/reward_len/std": 0.18894685804843903, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08832, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6577062010765076, "kl": 0.14706847071647644, "learning_rate": 4.989392202346423e-06, "loss": 0.0003, "num_tokens": 54217328.0, "reward": 0.9459635615348816, "reward_std": 0.056596677750349045, "rewards/reward_len/mean": 0.9459635019302368, "rewards/reward_len/std": 0.16104568541049957, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08864, "frac_reward_zero_std": 0.625, "grad_norm": 0.6757554411888123, "kl": 0.13412651419639587, "learning_rate": 4.989314969560569e-06, "loss": 0.0003, "num_tokens": 54413872.0, "reward": 0.9593099355697632, "reward_std": 0.07796579599380493, "rewards/reward_len/mean": 0.9593098759651184, "rewards/reward_len/std": 0.1616874635219574, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08896, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7473012804985046, "kl": 0.1465809941291809, "learning_rate": 4.98923745723917e-06, "loss": 0.0003, "num_tokens": 54610304.0, "reward": 0.9264323115348816, "reward_std": 0.0934956818819046, "rewards/reward_len/mean": 0.9264323115348816, "rewards/reward_len/std": 0.2292882651090622, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08928, "frac_reward_zero_std": 0.625, "grad_norm": 0.6172903776168823, "kl": 0.15288451313972473, "learning_rate": 4.989159665390931e-06, "loss": 0.0003, "num_tokens": 54806816.0, "reward": 0.9401041865348816, "reward_std": 0.08339567482471466, "rewards/reward_len/mean": 0.9401041865348816, "rewards/reward_len/std": 0.1738446205854416, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0896, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6484739184379578, "kl": 0.16665202379226685, "learning_rate": 4.989081594024586e-06, "loss": 0.0003, "num_tokens": 55003216.0, "reward": 0.9348958730697632, "reward_std": 0.1041487455368042, "rewards/reward_len/mean": 0.9348958134651184, "rewards/reward_len/std": 0.22163423895835876, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.08992, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6608350276947021, "kl": 0.14337918162345886, "learning_rate": 4.989003243148904e-06, "loss": 0.0003, "num_tokens": 55199680.0, "reward": 0.8193359375, "reward_std": 0.08847261220216751, "rewards/reward_len/mean": 0.8193359375, "rewards/reward_len/std": 0.37369152903556824, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09024, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7994632720947266, "kl": 0.15685515105724335, "learning_rate": 4.988924612772682e-06, "loss": 0.0003, "num_tokens": 55396192.0, "reward": 0.84375, "reward_std": 0.1226121187210083, "rewards/reward_len/mean": 0.84375, "rewards/reward_len/std": 0.30186423659324646, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09056, "frac_reward_zero_std": 0.5, "grad_norm": 0.7137040495872498, "kl": 0.1474229395389557, "learning_rate": 4.9888457029047485e-06, "loss": 0.0003, "num_tokens": 55592400.0, "reward": 0.9202474355697632, "reward_std": 0.1267140507698059, "rewards/reward_len/mean": 0.9202474355697632, "rewards/reward_len/std": 0.22786110639572144, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09088, "frac_reward_zero_std": 0.5, "grad_norm": 0.8168827295303345, "kl": 0.16146087646484375, "learning_rate": 4.988766513553967e-06, "loss": 0.0003, "num_tokens": 55788864.0, "reward": 0.9339193105697632, "reward_std": 0.116163469851017, "rewards/reward_len/mean": 0.9339192509651184, "rewards/reward_len/std": 0.22890448570251465, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0912, "frac_reward_zero_std": 0.625, "grad_norm": 0.6408174633979797, "kl": 0.17142651975154877, "learning_rate": 4.988687044729229e-06, "loss": 0.0003, "num_tokens": 55985392.0, "reward": 0.916015625, "reward_std": 0.0726352334022522, "rewards/reward_len/mean": 0.916015625, "rewards/reward_len/std": 0.2932605743408203, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09152, "frac_reward_zero_std": 0.625, "grad_norm": 0.5500815510749817, "kl": 0.17726555466651917, "learning_rate": 4.988607296439459e-06, "loss": 0.0004, "num_tokens": 56181952.0, "reward": 0.9645182490348816, "reward_std": 0.07057779282331467, "rewards/reward_len/mean": 0.9645181894302368, "rewards/reward_len/std": 0.1395009160041809, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09184, "frac_reward_zero_std": 0.625, "grad_norm": 0.6344060301780701, "kl": 0.1837608814239502, "learning_rate": 4.988527268693611e-06, "loss": 0.0004, "num_tokens": 56378592.0, "reward": 0.9430338740348816, "reward_std": 0.052120909094810486, "rewards/reward_len/mean": 0.9430338144302368, "rewards/reward_len/std": 0.18252725899219513, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09216, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7529948949813843, "kl": 0.16018211841583252, "learning_rate": 4.988446961500672e-06, "loss": 0.0003, "num_tokens": 56575072.0, "reward": 0.9791666865348816, "reward_std": 0.07178077101707458, "rewards/reward_len/mean": 0.9791666269302368, "rewards/reward_len/std": 0.11143743246793747, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09248, "frac_reward_zero_std": 0.5, "grad_norm": 0.6906608939170837, "kl": 0.1795535534620285, "learning_rate": 4.98836637486966e-06, "loss": 0.0004, "num_tokens": 56771488.0, "reward": 0.83203125, "reward_std": 0.09755201637744904, "rewards/reward_len/mean": 0.83203125, "rewards/reward_len/std": 0.29850468039512634, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0928, "frac_reward_zero_std": 0.4375, "grad_norm": 0.9807594418525696, "kl": 0.1846751868724823, "learning_rate": 4.988285508809626e-06, "loss": 0.0004, "num_tokens": 56967584.0, "reward": 0.8990885615348816, "reward_std": 0.11302575469017029, "rewards/reward_len/mean": 0.8990885615348816, "rewards/reward_len/std": 0.23224592208862305, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09312, "frac_reward_zero_std": 0.75, "grad_norm": 0.5358194708824158, "kl": 0.16956226527690887, "learning_rate": 4.988204363329648e-06, "loss": 0.0003, "num_tokens": 57164032.0, "reward": 0.892578125, "reward_std": 0.06409033387899399, "rewards/reward_len/mean": 0.892578125, "rewards/reward_len/std": 0.24855844676494598, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09344, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4902224540710449, "kl": 0.21661758422851562, "learning_rate": 4.9881229384388416e-06, "loss": 0.0004, "num_tokens": 57360416.0, "reward": 0.8330078125, "reward_std": 0.06817319989204407, "rewards/reward_len/mean": 0.8330078125, "rewards/reward_len/std": 0.3030795156955719, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09376, "frac_reward_zero_std": 0.625, "grad_norm": 0.8213449716567993, "kl": 0.17561742663383484, "learning_rate": 4.988041234146348e-06, "loss": 0.0004, "num_tokens": 57556720.0, "reward": 0.9739583730697632, "reward_std": 0.055555492639541626, "rewards/reward_len/mean": 0.9739583730697632, "rewards/reward_len/std": 0.10707540810108185, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09408, "frac_reward_zero_std": 0.625, "grad_norm": 0.6258949041366577, "kl": 0.24912530183792114, "learning_rate": 4.987959250461341e-06, "loss": 0.0005, "num_tokens": 57753264.0, "reward": 0.873046875, "reward_std": 0.0752042829990387, "rewards/reward_len/mean": 0.873046875, "rewards/reward_len/std": 0.22901344299316406, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0944, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4788155257701874, "kl": 0.16778747737407684, "learning_rate": 4.98787698739303e-06, "loss": 0.0003, "num_tokens": 57949760.0, "reward": 0.9427083730697632, "reward_std": 0.06510069966316223, "rewards/reward_len/mean": 0.9427083730697632, "rewards/reward_len/std": 0.17062297463417053, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09472, "frac_reward_zero_std": 0.75, "grad_norm": 0.4588495194911957, "kl": 0.19027698040008545, "learning_rate": 4.987794444950651e-06, "loss": 0.0004, "num_tokens": 58146368.0, "reward": 0.9560546875, "reward_std": 0.0299479179084301, "rewards/reward_len/mean": 0.9560546875, "rewards/reward_len/std": 0.16401556134223938, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09504, "frac_reward_zero_std": 0.75, "grad_norm": 0.5854141116142273, "kl": 0.18310603499412537, "learning_rate": 4.987711623143473e-06, "loss": 0.0004, "num_tokens": 58342976.0, "reward": 0.9710286855697632, "reward_std": 0.054185446351766586, "rewards/reward_len/mean": 0.9710286259651184, "rewards/reward_len/std": 0.14061592519283295, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09536, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6744587421417236, "kl": 0.18638305366039276, "learning_rate": 4.987628521980796e-06, "loss": 0.0004, "num_tokens": 58539424.0, "reward": 0.9485677480697632, "reward_std": 0.08318411558866501, "rewards/reward_len/mean": 0.9485677480697632, "rewards/reward_len/std": 0.1768524944782257, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09568, "frac_reward_zero_std": 0.625, "grad_norm": 0.628987729549408, "kl": 0.197912335395813, "learning_rate": 4.9875451414719535e-06, "loss": 0.0004, "num_tokens": 58735904.0, "reward": 0.9417318105697632, "reward_std": 0.08431451767683029, "rewards/reward_len/mean": 0.9417317509651184, "rewards/reward_len/std": 0.196083664894104, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.096, "frac_reward_zero_std": 0.375, "grad_norm": 0.7516345381736755, "kl": 0.25001639127731323, "learning_rate": 4.987461481626307e-06, "loss": 0.0005, "num_tokens": 58932304.0, "reward": 0.8818359375, "reward_std": 0.14079833030700684, "rewards/reward_len/mean": 0.8818359375, "rewards/reward_len/std": 0.28016671538352966, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09632, "frac_reward_zero_std": 0.5625, "grad_norm": 0.9488049745559692, "kl": 0.1855567991733551, "learning_rate": 4.9873775424532515e-06, "loss": 0.0004, "num_tokens": 59128544.0, "reward": 0.958984375, "reward_std": 0.09067913144826889, "rewards/reward_len/mean": 0.958984375, "rewards/reward_len/std": 0.15052416920661926, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09664, "frac_reward_zero_std": 0.75, "grad_norm": 0.42047029733657837, "kl": 0.2178574949502945, "learning_rate": 4.987293323962213e-06, "loss": 0.0004, "num_tokens": 59324784.0, "reward": 0.9739583730697632, "reward_std": 0.051922693848609924, "rewards/reward_len/mean": 0.9739583134651184, "rewards/reward_len/std": 0.16143757104873657, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09696, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8898394703865051, "kl": 0.19446057081222534, "learning_rate": 4.987208826162647e-06, "loss": 0.0004, "num_tokens": 59521424.0, "reward": 0.931640625, "reward_std": 0.11750184744596481, "rewards/reward_len/mean": 0.931640625, "rewards/reward_len/std": 0.2089848816394806, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09728, "frac_reward_zero_std": 0.625, "grad_norm": 0.603395402431488, "kl": 0.22529876232147217, "learning_rate": 4.9871240490640445e-06, "loss": 0.0005, "num_tokens": 59717696.0, "reward": 0.9534505605697632, "reward_std": 0.06372801959514618, "rewards/reward_len/mean": 0.9534505605697632, "rewards/reward_len/std": 0.15992245078086853, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0976, "frac_reward_zero_std": 0.5, "grad_norm": 0.686753511428833, "kl": 0.17430123686790466, "learning_rate": 4.987038992675926e-06, "loss": 0.0003, "num_tokens": 59914208.0, "reward": 0.875, "reward_std": 0.09746825695037842, "rewards/reward_len/mean": 0.875, "rewards/reward_len/std": 0.236048623919487, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09792, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7454960346221924, "kl": 0.19209128618240356, "learning_rate": 4.986953657007841e-06, "loss": 0.0004, "num_tokens": 60110672.0, "reward": 0.9280598759651184, "reward_std": 0.10386554896831512, "rewards/reward_len/mean": 0.9280599355697632, "rewards/reward_len/std": 0.2175706923007965, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09824, "frac_reward_zero_std": 0.625, "grad_norm": 0.7476916313171387, "kl": 0.18301215767860413, "learning_rate": 4.986868042069372e-06, "loss": 0.0004, "num_tokens": 60307056.0, "reward": 0.912109375, "reward_std": 0.12449096143245697, "rewards/reward_len/mean": 0.912109375, "rewards/reward_len/std": 0.3432842493057251, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09856, "frac_reward_zero_std": 0.625, "grad_norm": 0.9659981727600098, "kl": 0.23689834773540497, "learning_rate": 4.986782147870134e-06, "loss": 0.0005, "num_tokens": 60503584.0, "reward": 0.9251302480697632, "reward_std": 0.10527074337005615, "rewards/reward_len/mean": 0.9251302480697632, "rewards/reward_len/std": 0.25508764386177063, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09888, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6019448041915894, "kl": 0.1772214025259018, "learning_rate": 4.986695974419773e-06, "loss": 0.0004, "num_tokens": 60699872.0, "reward": 0.9156901240348816, "reward_std": 0.08635516464710236, "rewards/reward_len/mean": 0.9156901240348816, "rewards/reward_len/std": 0.20136821269989014, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.0992, "frac_reward_zero_std": 0.625, "grad_norm": 0.5572085976600647, "kl": 0.27463990449905396, "learning_rate": 4.986609521727964e-06, "loss": 0.0005, "num_tokens": 60896288.0, "reward": 0.7626953125, "reward_std": 0.11268115043640137, "rewards/reward_len/mean": 0.7626953125, "rewards/reward_len/std": 0.42565658688545227, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09952, "frac_reward_zero_std": 0.75, "grad_norm": 0.4231628179550171, "kl": 0.17810611426830292, "learning_rate": 4.986522789804417e-06, "loss": 0.0004, "num_tokens": 61092672.0, "reward": 0.9137369990348816, "reward_std": 0.05028747767210007, "rewards/reward_len/mean": 0.9137369990348816, "rewards/reward_len/std": 0.21124978363513947, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.09984, "frac_reward_zero_std": 0.6875, "grad_norm": 0.7284014225006104, "kl": 0.20978215336799622, "learning_rate": 4.98643577865887e-06, "loss": 0.0004, "num_tokens": 61289056.0, "reward": 0.9065755009651184, "reward_std": 0.06528086215257645, "rewards/reward_len/mean": 0.9065755009651184, "rewards/reward_len/std": 0.2469446361064911, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10016, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7000886797904968, "kl": 0.2989138066768646, "learning_rate": 4.9863484883010945e-06, "loss": 0.0006, "num_tokens": 61485584.0, "reward": 0.8707682490348816, "reward_std": 0.1197974681854248, "rewards/reward_len/mean": 0.8707682490348816, "rewards/reward_len/std": 0.28854963183403015, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10048, "frac_reward_zero_std": 0.375, "grad_norm": 0.8791053295135498, "kl": 0.20757333934307098, "learning_rate": 4.986260918740894e-06, "loss": 0.0004, "num_tokens": 61682160.0, "reward": 0.9176432490348816, "reward_std": 0.14943909645080566, "rewards/reward_len/mean": 0.9176431894302368, "rewards/reward_len/std": 0.24890637397766113, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1008, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6520059704780579, "kl": 0.2021569013595581, "learning_rate": 4.986173069988099e-06, "loss": 0.0004, "num_tokens": 61878464.0, "reward": 0.8740234375, "reward_std": 0.06649763882160187, "rewards/reward_len/mean": 0.8740234375, "rewards/reward_len/std": 0.29288795590400696, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10112, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6075562834739685, "kl": 0.1936589479446411, "learning_rate": 4.986084942052577e-06, "loss": 0.0004, "num_tokens": 62075120.0, "reward": 0.958984375, "reward_std": 0.04871319234371185, "rewards/reward_len/mean": 0.958984375, "rewards/reward_len/std": 0.1320180594921112, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10144, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8195614814758301, "kl": 0.2234935611486435, "learning_rate": 4.985996534944223e-06, "loss": 0.0004, "num_tokens": 62271632.0, "reward": 0.8857421875, "reward_std": 0.09317219257354736, "rewards/reward_len/mean": 0.8857421875, "rewards/reward_len/std": 0.26080915331840515, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10176, "frac_reward_zero_std": 0.625, "grad_norm": 0.5947416424751282, "kl": 0.24945947527885437, "learning_rate": 4.985907848672965e-06, "loss": 0.0005, "num_tokens": 62467888.0, "reward": 0.8909505009651184, "reward_std": 0.08388829231262207, "rewards/reward_len/mean": 0.8909505605697632, "rewards/reward_len/std": 0.2681647539138794, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10208, "frac_reward_zero_std": 0.5, "grad_norm": 0.9540805816650391, "kl": 0.22906923294067383, "learning_rate": 4.985818883248762e-06, "loss": 0.0005, "num_tokens": 62664432.0, "reward": 0.8753255605697632, "reward_std": 0.11415161192417145, "rewards/reward_len/mean": 0.8753255605697632, "rewards/reward_len/std": 0.26092657446861267, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1024, "frac_reward_zero_std": 0.5, "grad_norm": 0.7740412354469299, "kl": 0.23285314440727234, "learning_rate": 4.985729638681604e-06, "loss": 0.0005, "num_tokens": 62860912.0, "reward": 0.791015625, "reward_std": 0.116269052028656, "rewards/reward_len/mean": 0.791015625, "rewards/reward_len/std": 0.29557302594184875, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10272, "frac_reward_zero_std": 0.625, "grad_norm": 0.7258080840110779, "kl": 0.21934401988983154, "learning_rate": 4.985640114981513e-06, "loss": 0.0004, "num_tokens": 63057136.0, "reward": 0.91015625, "reward_std": 0.07462921738624573, "rewards/reward_len/mean": 0.91015625, "rewards/reward_len/std": 0.22375518083572388, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10304, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5900440216064453, "kl": 0.23094084858894348, "learning_rate": 4.985550312158541e-06, "loss": 0.0005, "num_tokens": 63253424.0, "reward": 0.98046875, "reward_std": 0.05243149772286415, "rewards/reward_len/mean": 0.98046875, "rewards/reward_len/std": 0.09928084164857864, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10336, "frac_reward_zero_std": 0.75, "grad_norm": 0.4705430865287781, "kl": 0.1974773406982422, "learning_rate": 4.985460230222775e-06, "loss": 0.0004, "num_tokens": 63449760.0, "reward": 0.9765625, "reward_std": 0.04583838954567909, "rewards/reward_len/mean": 0.9765625, "rewards/reward_len/std": 0.12302358448505402, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10368, "frac_reward_zero_std": 0.625, "grad_norm": 0.766782283782959, "kl": 0.20896635949611664, "learning_rate": 4.9853698691843275e-06, "loss": 0.0004, "num_tokens": 63646304.0, "reward": 0.9674479365348816, "reward_std": 0.07366439700126648, "rewards/reward_len/mean": 0.9674478769302368, "rewards/reward_len/std": 0.13577598333358765, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.104, "frac_reward_zero_std": 0.5, "grad_norm": 0.8756688237190247, "kl": 0.23943573236465454, "learning_rate": 4.985279229053347e-06, "loss": 0.0005, "num_tokens": 63842960.0, "reward": 0.9000651240348816, "reward_std": 0.12203947454690933, "rewards/reward_len/mean": 0.9000650644302368, "rewards/reward_len/std": 0.22668156027793884, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10432, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7217138409614563, "kl": 0.21665823459625244, "learning_rate": 4.985188309840012e-06, "loss": 0.0004, "num_tokens": 64039232.0, "reward": 0.9176432490348816, "reward_std": 0.08387972414493561, "rewards/reward_len/mean": 0.9176431894302368, "rewards/reward_len/std": 0.2151002287864685, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10464, "frac_reward_zero_std": 0.375, "grad_norm": 0.7947877049446106, "kl": 0.2119433432817459, "learning_rate": 4.9850971115545314e-06, "loss": 0.0004, "num_tokens": 64235824.0, "reward": 0.8671875, "reward_std": 0.10202635824680328, "rewards/reward_len/mean": 0.8671875, "rewards/reward_len/std": 0.24575646221637726, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10496, "frac_reward_zero_std": 0.375, "grad_norm": 0.8719542622566223, "kl": 0.21096409857273102, "learning_rate": 4.9850056342071474e-06, "loss": 0.0004, "num_tokens": 64432400.0, "reward": 0.9381510615348816, "reward_std": 0.08992186188697815, "rewards/reward_len/mean": 0.9381510019302368, "rewards/reward_len/std": 0.1641131490468979, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10528, "frac_reward_zero_std": 0.625, "grad_norm": 0.6044608950614929, "kl": 0.22195185720920563, "learning_rate": 4.984913877808132e-06, "loss": 0.0004, "num_tokens": 64628832.0, "reward": 0.9436849355697632, "reward_std": 0.08515959978103638, "rewards/reward_len/mean": 0.9436849355697632, "rewards/reward_len/std": 0.20638565719127655, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1056, "frac_reward_zero_std": 0.75, "grad_norm": 0.5217956900596619, "kl": 0.24507224559783936, "learning_rate": 4.984821842367787e-06, "loss": 0.0005, "num_tokens": 64825232.0, "reward": 0.9856771230697632, "reward_std": 0.03533861041069031, "rewards/reward_len/mean": 0.9856770634651184, "rewards/reward_len/std": 0.07712803781032562, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10592, "frac_reward_zero_std": 0.4375, "grad_norm": 0.7606909871101379, "kl": 0.23449228703975677, "learning_rate": 4.984729527896451e-06, "loss": 0.0005, "num_tokens": 65021712.0, "reward": 0.8697916865348816, "reward_std": 0.10330867767333984, "rewards/reward_len/mean": 0.8697916269302368, "rewards/reward_len/std": 0.2593039870262146, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10624, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6931866407394409, "kl": 0.2839498221874237, "learning_rate": 4.984636934404488e-06, "loss": 0.0006, "num_tokens": 65217952.0, "reward": 0.9358724355697632, "reward_std": 0.09433399885892868, "rewards/reward_len/mean": 0.9358724355697632, "rewards/reward_len/std": 0.20112717151641846, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10656, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7981671690940857, "kl": 0.2374555915594101, "learning_rate": 4.984544061902296e-06, "loss": 0.0005, "num_tokens": 65414064.0, "reward": 0.8990885615348816, "reward_std": 0.08965878933668137, "rewards/reward_len/mean": 0.8990885019302368, "rewards/reward_len/std": 0.2282242625951767, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10688, "frac_reward_zero_std": 0.75, "grad_norm": 0.5681701302528381, "kl": 0.2422051727771759, "learning_rate": 4.984450910400304e-06, "loss": 0.0005, "num_tokens": 65610560.0, "reward": 0.9329427480697632, "reward_std": 0.06092479079961777, "rewards/reward_len/mean": 0.9329426884651184, "rewards/reward_len/std": 0.21276235580444336, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1072, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4941754937171936, "kl": 0.2065250724554062, "learning_rate": 4.984357479908972e-06, "loss": 0.0004, "num_tokens": 65806960.0, "reward": 0.8583984375, "reward_std": 0.13218377530574799, "rewards/reward_len/mean": 0.8583984375, "rewards/reward_len/std": 0.36922162771224976, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10752, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6207228899002075, "kl": 0.20608362555503845, "learning_rate": 4.984263770438793e-06, "loss": 0.0004, "num_tokens": 66003568.0, "reward": 0.8766276240348816, "reward_std": 0.07969387620687485, "rewards/reward_len/mean": 0.8766276240348816, "rewards/reward_len/std": 0.2525416612625122, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10784, "frac_reward_zero_std": 0.625, "grad_norm": 0.6249269247055054, "kl": 0.32507622241973877, "learning_rate": 4.98416978200029e-06, "loss": 0.0007, "num_tokens": 66199808.0, "reward": 0.9889323115348816, "reward_std": 0.0442708320915699, "rewards/reward_len/mean": 0.9889323115348816, "rewards/reward_len/std": 0.0830853134393692, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10816, "frac_reward_zero_std": 0.625, "grad_norm": 0.5374633073806763, "kl": 0.2590201795101166, "learning_rate": 4.984075514604015e-06, "loss": 0.0005, "num_tokens": 66396192.0, "reward": 0.9078776240348816, "reward_std": 0.06370966881513596, "rewards/reward_len/mean": 0.9078776240348816, "rewards/reward_len/std": 0.20009206235408783, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10848, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6301341652870178, "kl": 0.22142821550369263, "learning_rate": 4.983980968260556e-06, "loss": 0.0004, "num_tokens": 66592768.0, "reward": 0.9466146230697632, "reward_std": 0.10067883133888245, "rewards/reward_len/mean": 0.9466146230697632, "rewards/reward_len/std": 0.1785728931427002, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1088, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7572622299194336, "kl": 0.25199538469314575, "learning_rate": 4.983886142980531e-06, "loss": 0.0005, "num_tokens": 66789216.0, "reward": 0.8645833730697632, "reward_std": 0.11808818578720093, "rewards/reward_len/mean": 0.8645833730697632, "rewards/reward_len/std": 0.28180092573165894, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10912, "frac_reward_zero_std": 0.625, "grad_norm": 0.8375306725502014, "kl": 0.268312931060791, "learning_rate": 4.983791038774585e-06, "loss": 0.0005, "num_tokens": 66985664.0, "reward": 0.9147135615348816, "reward_std": 0.08203423768281937, "rewards/reward_len/mean": 0.9147135615348816, "rewards/reward_len/std": 0.20398129522800446, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10944, "frac_reward_zero_std": 0.625, "grad_norm": 0.6439992785453796, "kl": 0.22537115216255188, "learning_rate": 4.983695655653399e-06, "loss": 0.0005, "num_tokens": 67182256.0, "reward": 0.931640625, "reward_std": 0.09739615023136139, "rewards/reward_len/mean": 0.931640625, "rewards/reward_len/std": 0.20583365857601166, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.10976, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3630291521549225, "kl": 0.32455378770828247, "learning_rate": 4.983599993627685e-06, "loss": 0.0006, "num_tokens": 67378752.0, "reward": 0.9326171875, "reward_std": 0.060417935252189636, "rewards/reward_len/mean": 0.9326171875, "rewards/reward_len/std": 0.23866578936576843, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11008, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5213278532028198, "kl": 0.27187711000442505, "learning_rate": 4.9835040527081844e-06, "loss": 0.0005, "num_tokens": 67575200.0, "reward": 0.947265625, "reward_std": 0.07740997523069382, "rewards/reward_len/mean": 0.947265625, "rewards/reward_len/std": 0.169701486825943, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1104, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6825294494628906, "kl": 0.2670292854309082, "learning_rate": 4.983407832905671e-06, "loss": 0.0005, "num_tokens": 67771520.0, "reward": 0.9599609375, "reward_std": 0.10208861529827118, "rewards/reward_len/mean": 0.9599609375, "rewards/reward_len/std": 0.14309662580490112, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11072, "frac_reward_zero_std": 0.5, "grad_norm": 0.7872427105903625, "kl": 0.2529137134552002, "learning_rate": 4.9833113342309495e-06, "loss": 0.0005, "num_tokens": 67967904.0, "reward": 0.8427734375, "reward_std": 0.13876771926879883, "rewards/reward_len/mean": 0.8427734375, "rewards/reward_len/std": 0.3221031129360199, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11104, "frac_reward_zero_std": 0.75, "grad_norm": 0.4888973832130432, "kl": 0.2349751889705658, "learning_rate": 4.9832145566948566e-06, "loss": 0.0005, "num_tokens": 68164352.0, "reward": 0.9661458730697632, "reward_std": 0.06749729812145233, "rewards/reward_len/mean": 0.9661458730697632, "rewards/reward_len/std": 0.155311718583107, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11136, "frac_reward_zero_std": 0.5625, "grad_norm": 0.9080219268798828, "kl": 0.25586017966270447, "learning_rate": 4.98311750030826e-06, "loss": 0.0005, "num_tokens": 68360800.0, "reward": 0.8626302480697632, "reward_std": 0.12489809095859528, "rewards/reward_len/mean": 0.8626302480697632, "rewards/reward_len/std": 0.3376501500606537, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11168, "frac_reward_zero_std": 0.75, "grad_norm": 0.6058202981948853, "kl": 0.2895188331604004, "learning_rate": 4.983020165082058e-06, "loss": 0.0006, "num_tokens": 68556992.0, "reward": 0.9361979365348816, "reward_std": 0.0770157128572464, "rewards/reward_len/mean": 0.9361979365348816, "rewards/reward_len/std": 0.21171681582927704, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.112, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6941797137260437, "kl": 0.26969248056411743, "learning_rate": 4.982922551027179e-06, "loss": 0.0005, "num_tokens": 68753456.0, "reward": 0.9153646230697632, "reward_std": 0.0711875781416893, "rewards/reward_len/mean": 0.9153645634651184, "rewards/reward_len/std": 0.21414682269096375, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11232, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7781234383583069, "kl": 0.2964722216129303, "learning_rate": 4.982824658154589e-06, "loss": 0.0006, "num_tokens": 68949984.0, "reward": 0.9700521230697632, "reward_std": 0.034258898347616196, "rewards/reward_len/mean": 0.9700520634651184, "rewards/reward_len/std": 0.10933268815279007, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11264, "frac_reward_zero_std": 0.875, "grad_norm": 0.36647942662239075, "kl": 0.20843760669231415, "learning_rate": 4.9827264864752764e-06, "loss": 0.0004, "num_tokens": 69146384.0, "reward": 0.9554036855697632, "reward_std": 0.01171875, "rewards/reward_len/mean": 0.9554036855697632, "rewards/reward_len/std": 0.1646680384874344, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11296, "frac_reward_zero_std": 0.8125, "grad_norm": 0.4388026297092438, "kl": 0.23127871751785278, "learning_rate": 4.9826280360002685e-06, "loss": 0.0005, "num_tokens": 69342768.0, "reward": 0.8658854365348816, "reward_std": 0.046399228274822235, "rewards/reward_len/mean": 0.8658854365348816, "rewards/reward_len/std": 0.2726123332977295, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11328, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5205654501914978, "kl": 0.244815394282341, "learning_rate": 4.982529306740619e-06, "loss": 0.0005, "num_tokens": 69539472.0, "reward": 0.9329427480697632, "reward_std": 0.07489942759275436, "rewards/reward_len/mean": 0.9329427480697632, "rewards/reward_len/std": 0.16717524826526642, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1136, "frac_reward_zero_std": 0.5, "grad_norm": 0.8152531385421753, "kl": 0.266327828168869, "learning_rate": 4.982430298707414e-06, "loss": 0.0005, "num_tokens": 69735936.0, "reward": 0.890625, "reward_std": 0.10646995157003403, "rewards/reward_len/mean": 0.890625, "rewards/reward_len/std": 0.23132891952991486, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11392, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6865121722221375, "kl": 0.2810913324356079, "learning_rate": 4.982331011911774e-06, "loss": 0.0006, "num_tokens": 69932656.0, "reward": 0.9345703125, "reward_std": 0.1171019971370697, "rewards/reward_len/mean": 0.9345703125, "rewards/reward_len/std": 0.1905432790517807, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11424, "frac_reward_zero_std": 0.5625, "grad_norm": 0.70012366771698, "kl": 0.3025228977203369, "learning_rate": 4.982231446364846e-06, "loss": 0.0006, "num_tokens": 70129056.0, "reward": 0.8912760615348816, "reward_std": 0.0744968131184578, "rewards/reward_len/mean": 0.8912760019302368, "rewards/reward_len/std": 0.2168196141719818, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11456, "frac_reward_zero_std": 0.75, "grad_norm": 0.4275182783603668, "kl": 0.2190450131893158, "learning_rate": 4.982131602077811e-06, "loss": 0.0004, "num_tokens": 70325472.0, "reward": 0.9117838740348816, "reward_std": 0.050354763865470886, "rewards/reward_len/mean": 0.9117838740348816, "rewards/reward_len/std": 0.2243437021970749, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11488, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5386918783187866, "kl": 0.2650001347064972, "learning_rate": 4.982031479061882e-06, "loss": 0.0005, "num_tokens": 70521904.0, "reward": 0.916015625, "reward_std": 0.0648043155670166, "rewards/reward_len/mean": 0.916015625, "rewards/reward_len/std": 0.18331731855869293, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1152, "frac_reward_zero_std": 0.625, "grad_norm": 0.5448402166366577, "kl": 0.2232052981853485, "learning_rate": 4.981931077328301e-06, "loss": 0.0004, "num_tokens": 70718144.0, "reward": 0.9534505605697632, "reward_std": 0.07850876450538635, "rewards/reward_len/mean": 0.9534505009651184, "rewards/reward_len/std": 0.17535531520843506, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11552, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5613270998001099, "kl": 0.23101890087127686, "learning_rate": 4.9818303968883445e-06, "loss": 0.0005, "num_tokens": 70914672.0, "reward": 0.9820963740348816, "reward_std": 0.0487230122089386, "rewards/reward_len/mean": 0.9820963740348816, "rewards/reward_len/std": 0.09116523712873459, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11584, "frac_reward_zero_std": 0.5, "grad_norm": 0.6078641414642334, "kl": 0.27006006240844727, "learning_rate": 4.9817294377533156e-06, "loss": 0.0005, "num_tokens": 71111184.0, "reward": 0.8723958730697632, "reward_std": 0.12105487287044525, "rewards/reward_len/mean": 0.8723958134651184, "rewards/reward_len/std": 0.29082512855529785, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11616, "frac_reward_zero_std": 0.625, "grad_norm": 0.5539776086807251, "kl": 0.3801600933074951, "learning_rate": 4.981628199934553e-06, "loss": 0.0008, "num_tokens": 71307504.0, "reward": 0.880859375, "reward_std": 0.09567587822675705, "rewards/reward_len/mean": 0.880859375, "rewards/reward_len/std": 0.2817397713661194, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11648, "frac_reward_zero_std": 0.5, "grad_norm": 0.7021991610527039, "kl": 0.292778879404068, "learning_rate": 4.981526683443425e-06, "loss": 0.0006, "num_tokens": 71503904.0, "reward": 0.9638671875, "reward_std": 0.09187515079975128, "rewards/reward_len/mean": 0.9638671875, "rewards/reward_len/std": 0.14011269807815552, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1168, "frac_reward_zero_std": 0.8125, "grad_norm": 0.35832977294921875, "kl": 0.26917755603790283, "learning_rate": 4.981424888291331e-06, "loss": 0.0005, "num_tokens": 71700192.0, "reward": 0.9296875, "reward_std": 0.038367513567209244, "rewards/reward_len/mean": 0.9296875, "rewards/reward_len/std": 0.24037858843803406, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11712, "frac_reward_zero_std": 0.625, "grad_norm": 0.606704592704773, "kl": 0.2380182147026062, "learning_rate": 4.981322814489703e-06, "loss": 0.0005, "num_tokens": 71896672.0, "reward": 0.93359375, "reward_std": 0.08497709780931473, "rewards/reward_len/mean": 0.93359375, "rewards/reward_len/std": 0.1882609724998474, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11744, "frac_reward_zero_std": 0.625, "grad_norm": 0.6150651574134827, "kl": 0.24555958807468414, "learning_rate": 4.981220462050002e-06, "loss": 0.0005, "num_tokens": 72093232.0, "reward": 0.9010416865348816, "reward_std": 0.07711885869503021, "rewards/reward_len/mean": 0.9010416269302368, "rewards/reward_len/std": 0.22146521508693695, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11776, "frac_reward_zero_std": 0.625, "grad_norm": 0.642767071723938, "kl": 0.26058658957481384, "learning_rate": 4.981117830983721e-06, "loss": 0.0005, "num_tokens": 72289632.0, "reward": 0.9000651240348816, "reward_std": 0.09616972506046295, "rewards/reward_len/mean": 0.9000651240348816, "rewards/reward_len/std": 0.2878645658493042, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11808, "frac_reward_zero_std": 0.625, "grad_norm": 0.6176137328147888, "kl": 0.2558377981185913, "learning_rate": 4.981014921302387e-06, "loss": 0.0005, "num_tokens": 72485968.0, "reward": 0.9765625, "reward_std": 0.07275305688381195, "rewards/reward_len/mean": 0.9765625, "rewards/reward_len/std": 0.12168814986944199, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1184, "frac_reward_zero_std": 0.75, "grad_norm": 0.654221773147583, "kl": 0.2902168929576874, "learning_rate": 4.980911733017556e-06, "loss": 0.0006, "num_tokens": 72682368.0, "reward": 0.8170573115348816, "reward_std": 0.04243423789739609, "rewards/reward_len/mean": 0.8170572519302368, "rewards/reward_len/std": 0.36997300386428833, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11872, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6374700665473938, "kl": 0.3265115022659302, "learning_rate": 4.980808266140813e-06, "loss": 0.0007, "num_tokens": 72878720.0, "reward": 0.9375, "reward_std": 0.0782892256975174, "rewards/reward_len/mean": 0.9375, "rewards/reward_len/std": 0.20130294561386108, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11904, "frac_reward_zero_std": 0.625, "grad_norm": 0.6965134143829346, "kl": 0.3249279260635376, "learning_rate": 4.980704520683779e-06, "loss": 0.0006, "num_tokens": 73074992.0, "reward": 0.9049479365348816, "reward_std": 0.07779471576213837, "rewards/reward_len/mean": 0.9049478769302368, "rewards/reward_len/std": 0.27618518471717834, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11936, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6993541717529297, "kl": 0.29671189188957214, "learning_rate": 4.980600496658103e-06, "loss": 0.0006, "num_tokens": 73271680.0, "reward": 0.9449869990348816, "reward_std": 0.1163741946220398, "rewards/reward_len/mean": 0.9449869394302368, "rewards/reward_len/std": 0.18133443593978882, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.11968, "frac_reward_zero_std": 0.375, "grad_norm": 0.7951380610466003, "kl": 0.22916045784950256, "learning_rate": 4.980496194075467e-06, "loss": 0.0005, "num_tokens": 73468224.0, "reward": 0.9059244990348816, "reward_std": 0.12064296007156372, "rewards/reward_len/mean": 0.9059244990348816, "rewards/reward_len/std": 0.24757792055606842, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12, "frac_reward_zero_std": 0.5, "grad_norm": 0.9399390816688538, "kl": 0.24769377708435059, "learning_rate": 4.980391612947583e-06, "loss": 0.0005, "num_tokens": 73664528.0, "reward": 0.875, "reward_std": 0.08645164221525192, "rewards/reward_len/mean": 0.875, "rewards/reward_len/std": 0.30731815099716187, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12032, "frac_reward_zero_std": 0.5, "grad_norm": 0.5999814867973328, "kl": 0.32891643047332764, "learning_rate": 4.980286753286196e-06, "loss": 0.0007, "num_tokens": 73860960.0, "reward": 0.9065755605697632, "reward_std": 0.12265858054161072, "rewards/reward_len/mean": 0.9065755605697632, "rewards/reward_len/std": 0.24035003781318665, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12064, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5528712272644043, "kl": 0.25928014516830444, "learning_rate": 4.980181615103078e-06, "loss": 0.0005, "num_tokens": 74057440.0, "reward": 0.9498698115348816, "reward_std": 0.09822811186313629, "rewards/reward_len/mean": 0.9498697519302368, "rewards/reward_len/std": 0.19990189373493195, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12096, "frac_reward_zero_std": 0.75, "grad_norm": 0.6117814779281616, "kl": 0.3367432653903961, "learning_rate": 4.980076198410039e-06, "loss": 0.0007, "num_tokens": 74254000.0, "reward": 0.9235026240348816, "reward_std": 0.06725481897592545, "rewards/reward_len/mean": 0.9235026240348816, "rewards/reward_len/std": 0.23921607434749603, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12128, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7215777039527893, "kl": 0.2727699875831604, "learning_rate": 4.979970503218914e-06, "loss": 0.0005, "num_tokens": 74450272.0, "reward": 0.775390625, "reward_std": 0.08670918643474579, "rewards/reward_len/mean": 0.775390625, "rewards/reward_len/std": 0.3441953659057617, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1216, "frac_reward_zero_std": 0.75, "grad_norm": 0.9631336331367493, "kl": 0.2849399447441101, "learning_rate": 4.979864529541574e-06, "loss": 0.0006, "num_tokens": 74646656.0, "reward": 0.9423828125, "reward_std": 0.03196436166763306, "rewards/reward_len/mean": 0.9423828125, "rewards/reward_len/std": 0.15362548828125, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12192, "frac_reward_zero_std": 0.75, "grad_norm": 0.5980784296989441, "kl": 0.2634403109550476, "learning_rate": 4.979758277389919e-06, "loss": 0.0005, "num_tokens": 74843088.0, "reward": 0.91015625, "reward_std": 0.051367681473493576, "rewards/reward_len/mean": 0.91015625, "rewards/reward_len/std": 0.19860883057117462, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12224, "frac_reward_zero_std": 0.5, "grad_norm": 0.7094947695732117, "kl": 0.28850382566452026, "learning_rate": 4.979651746775879e-06, "loss": 0.0006, "num_tokens": 75039616.0, "reward": 0.8753255605697632, "reward_std": 0.08563503623008728, "rewards/reward_len/mean": 0.8753255605697632, "rewards/reward_len/std": 0.2784956097602844, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12256, "frac_reward_zero_std": 0.625, "grad_norm": 0.6723067164421082, "kl": 0.26455748081207275, "learning_rate": 4.979544937711417e-06, "loss": 0.0005, "num_tokens": 75236128.0, "reward": 0.9108073115348816, "reward_std": 0.09380312263965607, "rewards/reward_len/mean": 0.9108072519302368, "rewards/reward_len/std": 0.256013423204422, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12288, "frac_reward_zero_std": 0.6875, "grad_norm": 0.5278858542442322, "kl": 0.3352203369140625, "learning_rate": 4.9794378502085285e-06, "loss": 0.0007, "num_tokens": 75432432.0, "reward": 0.8863932490348816, "reward_std": 0.09150020778179169, "rewards/reward_len/mean": 0.8863931894302368, "rewards/reward_len/std": 0.3099307417869568, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1232, "frac_reward_zero_std": 0.625, "grad_norm": 0.5988962054252625, "kl": 0.24847528338432312, "learning_rate": 4.979330484279239e-06, "loss": 0.0005, "num_tokens": 75628960.0, "reward": 0.9261068105697632, "reward_std": 0.09941554069519043, "rewards/reward_len/mean": 0.9261068105697632, "rewards/reward_len/std": 0.223832905292511, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12352, "frac_reward_zero_std": 0.4375, "grad_norm": 0.8019775152206421, "kl": 0.3128344416618347, "learning_rate": 4.979222839935602e-06, "loss": 0.0006, "num_tokens": 75825424.0, "reward": 0.8984375, "reward_std": 0.10939542949199677, "rewards/reward_len/mean": 0.8984375, "rewards/reward_len/std": 0.23522517085075378, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12384, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6479683518409729, "kl": 0.3821362853050232, "learning_rate": 4.979114917189709e-06, "loss": 0.0008, "num_tokens": 76021712.0, "reward": 0.8291015625, "reward_std": 0.12574782967567444, "rewards/reward_len/mean": 0.8291015625, "rewards/reward_len/std": 0.30617907643318176, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12416, "frac_reward_zero_std": 0.625, "grad_norm": 0.6782759428024292, "kl": 0.28288230299949646, "learning_rate": 4.979006716053677e-06, "loss": 0.0006, "num_tokens": 76218032.0, "reward": 0.9225260615348816, "reward_std": 0.08340580761432648, "rewards/reward_len/mean": 0.9225260615348816, "rewards/reward_len/std": 0.22389492392539978, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12448, "frac_reward_zero_std": 0.625, "grad_norm": 0.7160609364509583, "kl": 0.270611435174942, "learning_rate": 4.978898236539656e-06, "loss": 0.0005, "num_tokens": 76414656.0, "reward": 0.9368489980697632, "reward_std": 0.06814989447593689, "rewards/reward_len/mean": 0.9368489384651184, "rewards/reward_len/std": 0.167886421084404, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1248, "frac_reward_zero_std": 0.5625, "grad_norm": 0.8049089908599854, "kl": 0.2709094285964966, "learning_rate": 4.97878947865983e-06, "loss": 0.0005, "num_tokens": 76611088.0, "reward": 0.9244791865348816, "reward_std": 0.07185834646224976, "rewards/reward_len/mean": 0.9244791865348816, "rewards/reward_len/std": 0.1879940927028656, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12512, "frac_reward_zero_std": 0.5625, "grad_norm": 0.5765368342399597, "kl": 0.2744341492652893, "learning_rate": 4.978680442426409e-06, "loss": 0.0005, "num_tokens": 76807360.0, "reward": 0.8821614980697632, "reward_std": 0.09969843924045563, "rewards/reward_len/mean": 0.8821614384651184, "rewards/reward_len/std": 0.3090021014213562, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12544, "frac_reward_zero_std": 0.5625, "grad_norm": 0.7450719475746155, "kl": 0.27529090642929077, "learning_rate": 4.978571127851639e-06, "loss": 0.0006, "num_tokens": 77003856.0, "reward": 0.900390625, "reward_std": 0.10165678709745407, "rewards/reward_len/mean": 0.900390625, "rewards/reward_len/std": 0.21391122043132782, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12576, "frac_reward_zero_std": 0.5625, "grad_norm": 0.6389956474304199, "kl": 0.28474485874176025, "learning_rate": 4.978461534947795e-06, "loss": 0.0006, "num_tokens": 77200272.0, "reward": 0.9619140625, "reward_std": 0.09809305518865585, "rewards/reward_len/mean": 0.9619140625, "rewards/reward_len/std": 0.16097532212734222, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12608, "frac_reward_zero_std": 0.4375, "grad_norm": 0.6966572999954224, "kl": 0.30976608395576477, "learning_rate": 4.978351663727183e-06, "loss": 0.0006, "num_tokens": 77396816.0, "reward": 0.9010416269302368, "reward_std": 0.11276385188102722, "rewards/reward_len/mean": 0.9010416269302368, "rewards/reward_len/std": 0.2284860610961914, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.1264, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5235814452171326, "kl": 0.3274773359298706, "learning_rate": 4.978241514202142e-06, "loss": 0.0007, "num_tokens": 77593312.0, "reward": 0.9827474355697632, "reward_std": 0.038435421884059906, "rewards/reward_len/mean": 0.9827473759651184, "rewards/reward_len/std": 0.09763368219137192, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12672, "frac_reward_zero_std": 0.75, "grad_norm": 0.9163589477539062, "kl": 0.3441965579986572, "learning_rate": 4.978131086385041e-06, "loss": 0.0007, "num_tokens": 77789824.0, "reward": 0.9000651240348816, "reward_std": 0.05585475265979767, "rewards/reward_len/mean": 0.9000651240348816, "rewards/reward_len/std": 0.27280718088150024, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12704, "frac_reward_zero_std": 0.75, "grad_norm": 0.3683468699455261, "kl": 0.26881569623947144, "learning_rate": 4.978020380288279e-06, "loss": 0.0005, "num_tokens": 77986048.0, "reward": 0.8678385615348816, "reward_std": 0.04182879626750946, "rewards/reward_len/mean": 0.8678385615348816, "rewards/reward_len/std": 0.32839933037757874, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12736, "frac_reward_zero_std": 0.5, "grad_norm": 0.6271927356719971, "kl": 0.2520839273929596, "learning_rate": 4.977909395924289e-06, "loss": 0.0005, "num_tokens": 78182448.0, "reward": 0.9248046875, "reward_std": 0.10864973068237305, "rewards/reward_len/mean": 0.9248046875, "rewards/reward_len/std": 0.20295734703540802, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.12768, "frac_reward_zero_std": 0.6875, "grad_norm": 0.6000593900680542, "kl": 0.2679736912250519, "learning_rate": 4.977798133305532e-06, "loss": 0.0005, "num_tokens": 78379072.0, "reward": 0.9420573115348816, "reward_std": 0.05941763147711754, "rewards/reward_len/mean": 0.9420573115348816, "rewards/reward_len/std": 0.18510982394218445, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 729.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 729.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 729.0, "completions/min_terminated_length": 0.0, "epoch": 0.128, "frac_reward_zero_std": 0.75, "grad_norm": 0.5058302879333496, "kl": 0.24807330965995789, "learning_rate": 4.977686592444505e-06, "loss": 0.0005, "num_tokens": 78575616.0, "reward": 0.9625651240348816, "reward_std": 0.04426242411136627, "rewards/reward_len/mean": 0.9625651240348816, "rewards/reward_len/std": 0.12408048659563065, "step": 400 } ], "logging_steps": 1, "max_steps": 9375, "num_input_tokens_seen": 78575616, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }