{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.959501557632399, "eval_steps": 500, "global_step": 1900, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 1512.65478515625, "completions/mean_terminated_length": 1512.65478515625, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 0.001557632398753894, "grad_norm": 0.602738082408905, "kl": -8.884206703640984e-10, "learning_rate": 0.0, "loss": 0.02, "num_tokens": 133045.0, "reward": 1.3617119789123535, "reward_std": 0.09446237236261368, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.36171185970306396, "rewards/correct_reward_func/std": 0.15946270525455475, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 1677.65478515625, "completions/mean_terminated_length": 1518.7681884765625, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.003115264797507788, "grad_norm": 0.5372695922851562, "kl": -8.036803722522023e-10, "learning_rate": 2e-07, "loss": 0.0986, "num_tokens": 279938.0, "reward": 1.3327711820602417, "reward_std": 0.11337035149335861, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3327711820602417, "rewards/correct_reward_func/std": 0.14508673548698425, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1559.9881591796875, "completions/mean_terminated_length": 1480.084228515625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.004672897196261682, "grad_norm": 0.5770987868309021, "kl": 0.0008140590216498822, "learning_rate": 4e-07, "loss": 0.0348, "num_tokens": 417181.0, "reward": 1.3511351346969604, "reward_std": 0.12009123712778091, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.35113492608070374, "rewards/correct_reward_func/std": 0.16792196035385132, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1605.8929443359375, "completions/mean_terminated_length": 1605.8929443359375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.006230529595015576, "grad_norm": 0.5473058223724365, "kl": 0.0007717256958130747, "learning_rate": 6e-07, "loss": 0.0022, "num_tokens": 557962.0, "reward": 1.3706098794937134, "reward_std": 0.13414135575294495, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.37060973048210144, "rewards/correct_reward_func/std": 0.1871974617242813, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1513.59521484375, "completions/mean_terminated_length": 1433.1324462890625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.00778816199376947, "grad_norm": 0.5818154811859131, "kl": 0.0007767349597997963, "learning_rate": 8e-07, "loss": 0.0841, "num_tokens": 690954.0, "reward": 1.2946425676345825, "reward_std": 0.20980872213840485, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.342261403799057, "rewards/correct_reward_func/std": 0.15122981369495392, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1510.3214111328125, "completions/mean_terminated_length": 1510.3214111328125, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.009345794392523364, "grad_norm": 0.5755687355995178, "kl": 0.0008145314350258559, "learning_rate": 1e-06, "loss": 0.0469, "num_tokens": 823905.0, "reward": 1.4106093645095825, "reward_std": 0.12289178371429443, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42251405119895935, "rewards/correct_reward_func/std": 0.1738594025373459, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1465.047607421875, "completions/mean_terminated_length": 1465.047607421875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.010903426791277258, "grad_norm": 0.6340458393096924, "kl": 0.0008423295512329787, "learning_rate": 1.2e-06, "loss": -0.0223, "num_tokens": 952801.0, "reward": 1.3219488859176636, "reward_std": 0.14029237627983093, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3338535726070404, "rewards/correct_reward_func/std": 0.13986904919147491, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 1477.011962890625, "completions/mean_terminated_length": 1477.011962890625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.012461059190031152, "grad_norm": 0.5733258128166199, "kl": 0.0008218956645578146, "learning_rate": 1.4e-06, "loss": -0.0078, "num_tokens": 1082942.0, "reward": 1.3452098369598389, "reward_std": 0.09600555151700974, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3452097773551941, "rewards/correct_reward_func/std": 0.13662667572498322, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 1486.7261962890625, "completions/mean_terminated_length": 1486.7261962890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.014018691588785047, "grad_norm": 0.5680725574493408, "kl": 0.0008863781113177538, "learning_rate": 1.6e-06, "loss": -0.0153, "num_tokens": 1213953.0, "reward": 1.3956345319747925, "reward_std": 0.13600240647792816, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.39563441276550293, "rewards/correct_reward_func/std": 0.183233380317688, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1473.6429443359375, "completions/mean_terminated_length": 1473.6429443359375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.01557632398753894, "grad_norm": 0.5650473833084106, "kl": 0.0009199154155794531, "learning_rate": 1.8e-06, "loss": 0.0136, "num_tokens": 1343673.0, "reward": 1.3652774095535278, "reward_std": 0.0817384421825409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.36527732014656067, "rewards/correct_reward_func/std": 0.14138561487197876, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 1687.2857666015625, "completions/mean_terminated_length": 1528.634033203125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 0.017133956386292833, "grad_norm": 0.5159066915512085, "kl": 0.0009192087745759636, "learning_rate": 2e-06, "loss": 0.0883, "num_tokens": 1491351.0, "reward": 1.4014300107955933, "reward_std": 0.18983161449432373, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.44904908537864685, "rewards/correct_reward_func/std": 0.16164183616638184, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 1522.107177734375, "completions/mean_terminated_length": 1522.107177734375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.018691588785046728, "grad_norm": 0.5924640893936157, "kl": 0.0011004244443029165, "learning_rate": 1.999375e-06, "loss": -0.0201, "num_tokens": 1625304.0, "reward": 1.3764175176620483, "reward_std": 0.11236605048179626, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.388322114944458, "rewards/correct_reward_func/std": 0.14457714557647705, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1624.9285888671875, "completions/mean_terminated_length": 1464.756103515625, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.020249221183800622, "grad_norm": 0.5270693302154541, "kl": 0.0012032188242301345, "learning_rate": 1.99875e-06, "loss": 0.0802, "num_tokens": 1767936.0, "reward": 1.3871831893920898, "reward_std": 0.13983462750911713, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3990878164768219, "rewards/correct_reward_func/std": 0.1606336236000061, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1479.9761962890625, "completions/mean_terminated_length": 1479.9761962890625, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.021806853582554516, "grad_norm": 0.6190818548202515, "kl": 0.0014357012696564198, "learning_rate": 1.998125e-06, "loss": -0.0216, "num_tokens": 1898164.0, "reward": 1.3895180225372314, "reward_std": 0.08238209784030914, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.38951802253723145, "rewards/correct_reward_func/std": 0.11832733452320099, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1475.0, "completions/mean_terminated_length": 1394.072265625, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.02336448598130841, "grad_norm": 0.6059337258338928, "kl": 0.0016431952244602144, "learning_rate": 1.9975e-06, "loss": 0.0526, "num_tokens": 2027914.0, "reward": 1.3913025856018066, "reward_std": 0.1798313409090042, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.4270167648792267, "rewards/correct_reward_func/std": 0.15607501566410065, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1430.6785888671875, "completions/mean_terminated_length": 1430.6785888671875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.024922118380062305, "grad_norm": 0.5683785676956177, "kl": 0.0018854692461900413, "learning_rate": 1.996875e-06, "loss": 0.0171, "num_tokens": 2153959.0, "reward": 1.3746126890182495, "reward_std": 0.11688078194856644, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3746126592159271, "rewards/correct_reward_func/std": 0.16250161826610565, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1465.607177734375, "completions/mean_terminated_length": 1465.607177734375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0264797507788162, "grad_norm": 0.5914926528930664, "kl": 0.002100524492561817, "learning_rate": 1.99625e-06, "loss": 0.0093, "num_tokens": 2283034.0, "reward": 1.3505299091339111, "reward_std": 0.10699693858623505, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.35052984952926636, "rewards/correct_reward_func/std": 0.13587050139904022, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 1521.297607421875, "completions/mean_terminated_length": 1521.297607421875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "epoch": 0.028037383177570093, "grad_norm": 0.5633271336555481, "kl": 0.002299150452017784, "learning_rate": 1.995625e-06, "loss": 0.0163, "num_tokens": 2416793.0, "reward": 1.3485947847366333, "reward_std": 0.12012340128421783, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3485947251319885, "rewards/correct_reward_func/std": 0.15519553422927856, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 1465.702392578125, "completions/mean_terminated_length": 1465.702392578125, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.029595015576323987, "grad_norm": 0.5992788076400757, "kl": 0.002645128988660872, "learning_rate": 1.995e-06, "loss": 0.0093, "num_tokens": 2545768.0, "reward": 1.4050683975219727, "reward_std": 0.09077386558055878, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4050683081150055, "rewards/correct_reward_func/std": 0.1320529729127884, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1532.8929443359375, "completions/mean_terminated_length": 1532.8929443359375, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 0.03115264797507788, "grad_norm": 0.5546616315841675, "kl": 0.00307619187515229, "learning_rate": 1.994375e-06, "loss": 0.0061, "num_tokens": 2680495.0, "reward": 1.4120489358901978, "reward_std": 0.0814485251903534, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41204896569252014, "rewards/correct_reward_func/std": 0.14482632279396057, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 1707.9285888671875, "completions/mean_terminated_length": 1549.7803955078125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.03271028037383177, "grad_norm": 0.588342010974884, "kl": 0.0031734263757243752, "learning_rate": 1.9937499999999998e-06, "loss": 0.0697, "num_tokens": 2830129.0, "reward": 1.336440920829773, "reward_std": 0.10719747841358185, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3364408016204834, "rewards/correct_reward_func/std": 0.1317695528268814, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 1420.0238037109375, "completions/mean_terminated_length": 1420.0238037109375, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.03426791277258567, "grad_norm": 0.5817105770111084, "kl": 0.003916586167179048, "learning_rate": 1.993125e-06, "loss": -0.0456, "num_tokens": 2955267.0, "reward": 1.4114601612091064, "reward_std": 0.1481117159128189, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43526971340179443, "rewards/correct_reward_func/std": 0.13317571580410004, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 1391.2738037109375, "completions/mean_terminated_length": 1391.2738037109375, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.03582554517133956, "grad_norm": 0.6308074593544006, "kl": 0.004312207689508796, "learning_rate": 1.9925e-06, "loss": -0.0213, "num_tokens": 3078014.0, "reward": 1.37629234790802, "reward_std": 0.16423028707504272, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4001017212867737, "rewards/correct_reward_func/std": 0.18709857761859894, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1513.857177734375, "completions/mean_terminated_length": 1513.857177734375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 0.037383177570093455, "grad_norm": 0.5661871433258057, "kl": 0.004699907032772899, "learning_rate": 1.991875e-06, "loss": -0.0204, "num_tokens": 3211100.0, "reward": 1.384656310081482, "reward_std": 0.06906478852033615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.38465628027915955, "rewards/correct_reward_func/std": 0.13333608210086823, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1553.4761962890625, "completions/mean_terminated_length": 1553.4761962890625, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.03894080996884735, "grad_norm": 0.5596578121185303, "kl": 0.0051078200340271, "learning_rate": 1.9912499999999998e-06, "loss": -0.0025, "num_tokens": 3347538.0, "reward": 1.4165700674057007, "reward_std": 0.0908581092953682, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4165700376033783, "rewards/correct_reward_func/std": 0.11516361683607101, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1607.5357666015625, "completions/mean_terminated_length": 1528.2047119140625, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.040498442367601244, "grad_norm": 0.5141485929489136, "kl": 0.005483957007527351, "learning_rate": 1.990625e-06, "loss": 0.0479, "num_tokens": 3488835.0, "reward": 1.3931559324264526, "reward_std": 0.09568320959806442, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3931559920310974, "rewards/correct_reward_func/std": 0.15160411596298218, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 1509.1429443359375, "completions/mean_terminated_length": 1509.1429443359375, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.04205607476635514, "grad_norm": 0.5826243758201599, "kl": 0.006127089960500598, "learning_rate": 1.99e-06, "loss": 0.0232, "num_tokens": 3621663.0, "reward": 1.3912872076034546, "reward_std": 0.09357985109090805, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3912872076034546, "rewards/correct_reward_func/std": 0.12481305748224258, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 1489.3214111328125, "completions/mean_terminated_length": 1489.3214111328125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.04361370716510903, "grad_norm": 0.5792966485023499, "kl": 0.006385253742337227, "learning_rate": 1.989375e-06, "loss": -0.0165, "num_tokens": 3752922.0, "reward": 1.353344440460205, "reward_std": 0.1214955672621727, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.3771539032459259, "rewards/correct_reward_func/std": 0.1341404765844345, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 1522.3095703125, "completions/mean_terminated_length": 1522.3095703125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 0.045171339563862926, "grad_norm": 0.566626250743866, "kl": 0.0068822442553937435, "learning_rate": 1.98875e-06, "loss": 0.0097, "num_tokens": 3886802.0, "reward": 1.4724082946777344, "reward_std": 0.11441156268119812, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4843129515647888, "rewards/correct_reward_func/std": 0.1651531606912613, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1525.607177734375, "completions/mean_terminated_length": 1525.607177734375, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 0.04672897196261682, "grad_norm": 0.5457088351249695, "kl": 0.0072290110401809216, "learning_rate": 1.9881249999999997e-06, "loss": -0.0188, "num_tokens": 4021133.0, "reward": 1.4517107009887695, "reward_std": 0.07869784533977509, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4517105519771576, "rewards/correct_reward_func/std": 0.1555166095495224, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 1493.107177734375, "completions/mean_terminated_length": 1493.107177734375, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 0.048286604361370715, "grad_norm": 0.6338136792182922, "kl": 0.00765396817587316, "learning_rate": 1.9875e-06, "loss": -0.0171, "num_tokens": 4152536.0, "reward": 1.4097148180007935, "reward_std": 0.06910388171672821, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.409714937210083, "rewards/correct_reward_func/std": 0.15830738842487335, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1465.21435546875, "completions/mean_terminated_length": 1465.21435546875, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 0.04984423676012461, "grad_norm": 0.5526667833328247, "kl": 0.008096857462078333, "learning_rate": 1.986875e-06, "loss": -0.0181, "num_tokens": 4281500.0, "reward": 1.4042376279830933, "reward_std": 0.13532030582427979, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4280470907688141, "rewards/correct_reward_func/std": 0.1270482838153839, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 1565.357177734375, "completions/mean_terminated_length": 1565.357177734375, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.0514018691588785, "grad_norm": 0.5518661737442017, "kl": 0.008292545564472675, "learning_rate": 1.98625e-06, "loss": -0.0018, "num_tokens": 4419188.0, "reward": 1.4700257778167725, "reward_std": 0.07613833993673325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4700256884098053, "rewards/correct_reward_func/std": 0.1587969958782196, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2546.0, "completions/max_terminated_length": 2546.0, "completions/mean_length": 1508.3333740234375, "completions/mean_terminated_length": 1508.3333740234375, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.0529595015576324, "grad_norm": 0.5799688696861267, "kl": 0.008962879423052073, "learning_rate": 1.9856249999999997e-06, "loss": 0.0352, "num_tokens": 4551942.0, "reward": 1.3886348009109497, "reward_std": 0.0952007845044136, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4005395472049713, "rewards/correct_reward_func/std": 0.13069912791252136, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1659.6429443359375, "completions/mean_terminated_length": 1580.939697265625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.05451713395638629, "grad_norm": 0.5250554084777832, "kl": 0.008917136583477259, "learning_rate": 1.985e-06, "loss": 0.0202, "num_tokens": 4697496.0, "reward": 1.4027281999588013, "reward_std": 0.1092085987329483, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.41463297605514526, "rewards/correct_reward_func/std": 0.15449127554893494, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1497.416748046875, "completions/mean_terminated_length": 1497.416748046875, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.056074766355140186, "grad_norm": 0.5780891180038452, "kl": 0.009273746516555548, "learning_rate": 1.984375e-06, "loss": -0.0039, "num_tokens": 4829153.0, "reward": 1.447800636291504, "reward_std": 0.09562971442937851, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44780054688453674, "rewards/correct_reward_func/std": 0.14727683365345, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 1602.2261962890625, "completions/mean_terminated_length": 1522.831298828125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.05763239875389408, "grad_norm": 0.544447660446167, "kl": 0.009292236994951963, "learning_rate": 1.98375e-06, "loss": 0.0476, "num_tokens": 4969716.0, "reward": 1.3862570524215698, "reward_std": 0.13318565487861633, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.39816194772720337, "rewards/correct_reward_func/std": 0.13715338706970215, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1596.1429443359375, "completions/mean_terminated_length": 1596.1429443359375, "completions/min_length": 1108.0, "completions/min_terminated_length": 1108.0, "epoch": 0.059190031152647975, "grad_norm": 0.5399186015129089, "kl": 0.009696871042251587, "learning_rate": 1.9831249999999998e-06, "loss": 0.0039, "num_tokens": 5109858.0, "reward": 1.450761318206787, "reward_std": 0.07588593661785126, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4507613480091095, "rewards/correct_reward_func/std": 0.14506648480892181, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 1563.5833740234375, "completions/mean_terminated_length": 1563.5833740234375, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.06074766355140187, "grad_norm": 0.5695660710334778, "kl": 0.01048774877563119, "learning_rate": 1.9824999999999997e-06, "loss": -0.0184, "num_tokens": 5247163.0, "reward": 1.468957781791687, "reward_std": 0.11620029807090759, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48086267709732056, "rewards/correct_reward_func/std": 0.15909142792224884, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 1592.40478515625, "completions/mean_terminated_length": 1592.40478515625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.06230529595015576, "grad_norm": 0.5112860798835754, "kl": 0.01074655307456851, "learning_rate": 1.981875e-06, "loss": 0.0015, "num_tokens": 5387021.0, "reward": 1.4456558227539062, "reward_std": 0.11054416000843048, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45756059885025024, "rewards/correct_reward_func/std": 0.1503782570362091, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1634.6190185546875, "completions/mean_terminated_length": 1555.6143798828125, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.06386292834890965, "grad_norm": 0.5349351167678833, "kl": 0.010826343204826117, "learning_rate": 1.98125e-06, "loss": 0.0725, "num_tokens": 5530521.0, "reward": 1.4252383708953857, "reward_std": 0.11318045854568481, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43714308738708496, "rewards/correct_reward_func/std": 0.1361854374408722, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 1527.047607421875, "completions/mean_terminated_length": 1527.047607421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.06542056074766354, "grad_norm": 0.582492470741272, "kl": 0.010992726311087608, "learning_rate": 1.980625e-06, "loss": -0.0331, "num_tokens": 5664793.0, "reward": 1.4851031303405762, "reward_std": 0.1116347685456276, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4851030707359314, "rewards/correct_reward_func/std": 0.18202589452266693, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 1511.59521484375, "completions/mean_terminated_length": 1511.59521484375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.06697819314641744, "grad_norm": 0.5876683592796326, "kl": 0.011393898166716099, "learning_rate": 1.98e-06, "loss": 0.0116, "num_tokens": 5797743.0, "reward": 1.3936251401901245, "reward_std": 0.08762513846158981, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4055299162864685, "rewards/correct_reward_func/std": 0.15156783163547516, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 1573.21435546875, "completions/mean_terminated_length": 1573.21435546875, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 0.06853582554517133, "grad_norm": 0.586552619934082, "kl": 0.012315568514168262, "learning_rate": 1.979375e-06, "loss": 0.0067, "num_tokens": 5935767.0, "reward": 1.3731769323349, "reward_std": 0.09234315901994705, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3850816488265991, "rewards/correct_reward_func/std": 0.1175212487578392, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 1540.416748046875, "completions/mean_terminated_length": 1540.416748046875, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "epoch": 0.07009345794392523, "grad_norm": 0.5170222520828247, "kl": 0.012232929933816195, "learning_rate": 1.97875e-06, "loss": -0.0046, "num_tokens": 6071102.0, "reward": 1.4366557598114014, "reward_std": 0.05740538239479065, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4366556704044342, "rewards/correct_reward_func/std": 0.12168268114328384, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 1525.0, "completions/mean_terminated_length": 1525.0, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 0.07165109034267912, "grad_norm": 0.5328289270401001, "kl": 0.01250599604099989, "learning_rate": 1.978125e-06, "loss": 0.001, "num_tokens": 6205328.0, "reward": 1.4486056566238403, "reward_std": 0.08073987811803818, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4605104327201843, "rewards/correct_reward_func/std": 0.1921333372592926, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 1511.3095703125, "completions/mean_terminated_length": 1511.3095703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.07320872274143302, "grad_norm": 0.5911400318145752, "kl": 0.012701177038252354, "learning_rate": 1.9775e-06, "loss": -0.0073, "num_tokens": 6338398.0, "reward": 1.4118802547454834, "reward_std": 0.09744904190301895, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42378488183021545, "rewards/correct_reward_func/std": 0.15915460884571075, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1445.1785888671875, "completions/mean_terminated_length": 1445.1785888671875, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.07476635514018691, "grad_norm": 0.5825939774513245, "kl": 0.014202028047293425, "learning_rate": 1.976875e-06, "loss": 0.0291, "num_tokens": 6465733.0, "reward": 1.4184983968734741, "reward_std": 0.07337880879640579, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41849830746650696, "rewards/correct_reward_func/std": 0.12052969634532928, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1505.0595703125, "completions/mean_terminated_length": 1505.0595703125, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.0763239875389408, "grad_norm": 0.5948581099510193, "kl": 0.013798453379422426, "learning_rate": 1.97625e-06, "loss": -0.0075, "num_tokens": 6598212.0, "reward": 1.4806807041168213, "reward_std": 0.07690379023551941, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48068052530288696, "rewards/correct_reward_func/std": 0.208627387881279, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1519.84521484375, "completions/mean_terminated_length": 1439.457763671875, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.0778816199376947, "grad_norm": 0.577416181564331, "kl": 0.013814115896821022, "learning_rate": 1.975625e-06, "loss": 0.0698, "num_tokens": 6731633.0, "reward": 1.3718026876449585, "reward_std": 0.08667115122079849, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.37180256843566895, "rewards/correct_reward_func/std": 0.1464298814535141, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 1481.9881591796875, "completions/mean_terminated_length": 1481.9881591796875, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.0794392523364486, "grad_norm": 0.5731110572814941, "kl": 0.014519122894853354, "learning_rate": 1.975e-06, "loss": 0.0268, "num_tokens": 6861940.0, "reward": 1.4318668842315674, "reward_std": 0.10813824832439423, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.443771630525589, "rewards/correct_reward_func/std": 0.155172199010849, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1394.15478515625, "completions/mean_terminated_length": 1394.15478515625, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.08099688473520249, "grad_norm": 0.6160910129547119, "kl": 0.01515409117564559, "learning_rate": 1.974375e-06, "loss": -0.0362, "num_tokens": 6984959.0, "reward": 1.4249346256256104, "reward_std": 0.06116212159395218, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.424934446811676, "rewards/correct_reward_func/std": 0.15084582567214966, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1428.0238037109375, "completions/mean_terminated_length": 1428.0238037109375, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.08255451713395638, "grad_norm": 0.6062555909156799, "kl": 0.015089603140950203, "learning_rate": 1.97375e-06, "loss": 0.0005, "num_tokens": 7110697.0, "reward": 1.427535891532898, "reward_std": 0.11901802569627762, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4394405484199524, "rewards/correct_reward_func/std": 0.17434334754943848, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1549.46435546875, "completions/mean_terminated_length": 1469.4337158203125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.08411214953271028, "grad_norm": 0.5520058274269104, "kl": 0.014234152156859636, "learning_rate": 1.973125e-06, "loss": 0.0763, "num_tokens": 7246990.0, "reward": 1.5137581825256348, "reward_std": 0.0792492926120758, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5137581825256348, "rewards/correct_reward_func/std": 0.1610475480556488, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 1293.4405517578125, "completions/mean_terminated_length": 1293.4405517578125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.08566978193146417, "grad_norm": 0.605660617351532, "kl": 0.01542581431567669, "learning_rate": 1.9724999999999997e-06, "loss": -0.038, "num_tokens": 7361321.0, "reward": 1.4857640266418457, "reward_std": 0.10166757553815842, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4857640564441681, "rewards/correct_reward_func/std": 0.16004827618598938, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1419.96435546875, "completions/mean_terminated_length": 1419.96435546875, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.08722741433021806, "grad_norm": 0.5911623239517212, "kl": 0.015617348719388247, "learning_rate": 1.971875e-06, "loss": 0.0036, "num_tokens": 7486292.0, "reward": 1.376123309135437, "reward_std": 0.09990442544221878, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.38802799582481384, "rewards/correct_reward_func/std": 0.12811584770679474, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 1440.84521484375, "completions/mean_terminated_length": 1440.84521484375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.08878504672897196, "grad_norm": 0.5922889113426208, "kl": 0.016300208866596222, "learning_rate": 1.97125e-06, "loss": 0.0113, "num_tokens": 7613335.0, "reward": 1.4130823612213135, "reward_std": 0.1000562384724617, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42498698830604553, "rewards/correct_reward_func/std": 0.11995380371809006, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1405.3214111328125, "completions/mean_terminated_length": 1405.3214111328125, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.09034267912772585, "grad_norm": 0.5695552825927734, "kl": 0.015375382732599974, "learning_rate": 1.970625e-06, "loss": 0.0023, "num_tokens": 7737406.0, "reward": 1.4474685192108154, "reward_std": 0.12695710361003876, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4712778627872467, "rewards/correct_reward_func/std": 0.16331063210964203, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6591.0, "completions/max_terminated_length": 6591.0, "completions/mean_length": 1408.65478515625, "completions/mean_terminated_length": 1408.65478515625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09190031152647975, "grad_norm": 0.5550790429115295, "kl": 0.015237292740494013, "learning_rate": 1.9699999999999998e-06, "loss": -0.0213, "num_tokens": 7861643.0, "reward": 1.4678882360458374, "reward_std": 0.09509699046611786, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46788811683654785, "rewards/correct_reward_func/std": 0.1579650342464447, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1376.84521484375, "completions/mean_terminated_length": 1376.84521484375, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.09345794392523364, "grad_norm": 0.5696773529052734, "kl": 0.01649821363389492, "learning_rate": 1.969375e-06, "loss": -0.0025, "num_tokens": 7983274.0, "reward": 1.4070838689804077, "reward_std": 0.061898693442344666, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4070839285850525, "rewards/correct_reward_func/std": 0.1115923598408699, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 1368.8214111328125, "completions/mean_terminated_length": 1368.8214111328125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.09501557632398754, "grad_norm": 0.6040320992469788, "kl": 0.017192344181239605, "learning_rate": 1.96875e-06, "loss": 0.0044, "num_tokens": 8104279.0, "reward": 1.503865361213684, "reward_std": 0.10960246622562408, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5157700777053833, "rewards/correct_reward_func/std": 0.17495499551296234, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 1308.5, "completions/mean_terminated_length": 1308.5, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.09657320872274143, "grad_norm": 0.6219011545181274, "kl": 0.017216363921761513, "learning_rate": 1.968125e-06, "loss": 0.0128, "num_tokens": 8219905.0, "reward": 1.4492701292037964, "reward_std": 0.0713193342089653, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4492699205875397, "rewards/correct_reward_func/std": 0.179514080286026, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1346.261962890625, "completions/mean_terminated_length": 1346.261962890625, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.09813084112149532, "grad_norm": 0.5801587104797363, "kl": 0.01690333615988493, "learning_rate": 1.9675e-06, "loss": 0.0077, "num_tokens": 8338913.0, "reward": 1.456557273864746, "reward_std": 0.11657059192657471, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46846190094947815, "rewards/correct_reward_func/std": 0.1239844486117363, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 1346.7381591796875, "completions/mean_terminated_length": 1346.7381591796875, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.09968847352024922, "grad_norm": 0.6113296747207642, "kl": 0.017923656851053238, "learning_rate": 1.9668749999999997e-06, "loss": -0.0187, "num_tokens": 8458009.0, "reward": 1.448889136314392, "reward_std": 0.07096786797046661, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4488890469074249, "rewards/correct_reward_func/std": 0.15353722870349884, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2639.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 1402.4761962890625, "completions/mean_terminated_length": 1402.4761962890625, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.10124610591900311, "grad_norm": 0.5536694526672363, "kl": 0.017508030869066715, "learning_rate": 1.96625e-06, "loss": -0.0024, "num_tokens": 8581955.0, "reward": 1.4375591278076172, "reward_std": 0.12363146990537643, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4613686203956604, "rewards/correct_reward_func/std": 0.16242319345474243, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1408.5357666015625, "completions/mean_terminated_length": 1408.5357666015625, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 0.102803738317757, "grad_norm": 0.5778936743736267, "kl": 0.018688876181840897, "learning_rate": 1.965625e-06, "loss": 0.0091, "num_tokens": 8706158.0, "reward": 1.4179571866989136, "reward_std": 0.08643031865358353, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41795703768730164, "rewards/correct_reward_func/std": 0.14004966616630554, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 1323.7857666015625, "completions/mean_terminated_length": 1323.7857666015625, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.1043613707165109, "grad_norm": 0.6043643355369568, "kl": 0.01887867320328951, "learning_rate": 1.965e-06, "loss": -0.0489, "num_tokens": 8823284.0, "reward": 1.4494088888168335, "reward_std": 0.1078440248966217, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4613136053085327, "rewards/correct_reward_func/std": 0.16942912340164185, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1402.0714111328125, "completions/mean_terminated_length": 1402.0714111328125, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.1059190031152648, "grad_norm": 0.5758241415023804, "kl": 0.01884887833148241, "learning_rate": 1.9643749999999997e-06, "loss": 0.0126, "num_tokens": 8947046.0, "reward": 1.4188536405563354, "reward_std": 0.12322477996349335, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4426631033420563, "rewards/correct_reward_func/std": 0.137950137257576, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1305.75, "completions/mean_terminated_length": 1305.75, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.10747663551401869, "grad_norm": 0.634208083152771, "kl": 0.01928142551332712, "learning_rate": 1.96375e-06, "loss": 0.0068, "num_tokens": 9062675.0, "reward": 1.4428762197494507, "reward_std": 0.0850701555609703, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44287601113319397, "rewards/correct_reward_func/std": 0.13382165133953094, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1331.6190185546875, "completions/mean_terminated_length": 1331.6190185546875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.10903426791277258, "grad_norm": 0.5981489419937134, "kl": 0.019094611518085003, "learning_rate": 1.963125e-06, "loss": 0.0136, "num_tokens": 9180489.0, "reward": 1.4353286027908325, "reward_std": 0.1428486853837967, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.47104281187057495, "rewards/correct_reward_func/std": 0.14202405512332916, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 1393.84521484375, "completions/mean_terminated_length": 1311.939697265625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.11059190031152648, "grad_norm": 0.6035619378089905, "kl": 0.018923446536064148, "learning_rate": 1.9625e-06, "loss": 0.0341, "num_tokens": 9303716.0, "reward": 1.4664435386657715, "reward_std": 0.09690098464488983, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4783483147621155, "rewards/correct_reward_func/std": 0.16768568754196167, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 1333.5833740234375, "completions/mean_terminated_length": 1333.5833740234375, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.11214953271028037, "grad_norm": 0.6100907325744629, "kl": 0.020233074203133583, "learning_rate": 1.9618749999999997e-06, "loss": 0.0087, "num_tokens": 9421647.0, "reward": 1.5223532915115356, "reward_std": 0.09191029518842697, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5342578887939453, "rewards/correct_reward_func/std": 0.14939941465854645, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1924.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 1286.8333740234375, "completions/mean_terminated_length": 1286.8333740234375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.11370716510903427, "grad_norm": 0.599524736404419, "kl": 0.020186283625662327, "learning_rate": 1.9612499999999996e-06, "loss": -0.0046, "num_tokens": 9535795.0, "reward": 1.5099772214889526, "reward_std": 0.08711431175470352, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5099770426750183, "rewards/correct_reward_func/std": 0.15553654730319977, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1436.34521484375, "completions/mean_terminated_length": 1436.34521484375, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 0.11526479750778816, "grad_norm": 0.5551663041114807, "kl": 0.019929789006710052, "learning_rate": 1.960625e-06, "loss": 0.0163, "num_tokens": 9662448.0, "reward": 1.4788012504577637, "reward_std": 0.06518861651420593, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4788011312484741, "rewards/correct_reward_func/std": 0.1376221776008606, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 1261.25, "completions/mean_terminated_length": 1261.25, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 0.11682242990654206, "grad_norm": 0.6235907673835754, "kl": 0.020636904053390026, "learning_rate": 1.96e-06, "loss": 0.0244, "num_tokens": 9774249.0, "reward": 1.4924941062927246, "reward_std": 0.11271940171718597, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.492494136095047, "rewards/correct_reward_func/std": 0.1804288774728775, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 1304.4405517578125, "completions/mean_terminated_length": 1304.4405517578125, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.11838006230529595, "grad_norm": 0.5913470983505249, "kl": 0.021659635938704014, "learning_rate": 1.959375e-06, "loss": -0.0261, "num_tokens": 9889840.0, "reward": 1.5048737525939941, "reward_std": 0.07548126578330994, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5048737525939941, "rewards/correct_reward_func/std": 0.15308529138565063, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2124.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 1349.297607421875, "completions/mean_terminated_length": 1349.297607421875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.11993769470404984, "grad_norm": 0.6202085018157959, "kl": 0.0223425030708313, "learning_rate": 1.95875e-06, "loss": -0.0086, "num_tokens": 10009103.0, "reward": 1.4445719718933105, "reward_std": 0.07185468822717667, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44457200169563293, "rewards/correct_reward_func/std": 0.134343683719635, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1348.916748046875, "completions/mean_terminated_length": 1348.916748046875, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 0.12149532710280374, "grad_norm": 0.5950252413749695, "kl": 0.022254208102822304, "learning_rate": 1.958125e-06, "loss": -0.0042, "num_tokens": 10128292.0, "reward": 1.423844575881958, "reward_std": 0.1012512668967247, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43574920296669006, "rewards/correct_reward_func/std": 0.14782190322875977, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1300.7738037109375, "completions/mean_terminated_length": 1300.7738037109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.12305295950155763, "grad_norm": 0.6564156413078308, "kl": 0.022449446842074394, "learning_rate": 1.9575e-06, "loss": -0.0301, "num_tokens": 10243383.0, "reward": 1.4560483694076538, "reward_std": 0.09143143892288208, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45604828000068665, "rewards/correct_reward_func/std": 0.18612980842590332, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1269.9761962890625, "completions/mean_terminated_length": 1269.9761962890625, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.12461059190031153, "grad_norm": 0.5963668823242188, "kl": 0.02302556298673153, "learning_rate": 1.956875e-06, "loss": 0.0111, "num_tokens": 10355959.0, "reward": 1.4893500804901123, "reward_std": 0.060889869928359985, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48934999108314514, "rewards/correct_reward_func/std": 0.17626559734344482, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1332.357177734375, "completions/mean_terminated_length": 1332.357177734375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.1261682242990654, "grad_norm": 0.579901397228241, "kl": 0.023500431329011917, "learning_rate": 1.95625e-06, "loss": -0.0165, "num_tokens": 10473763.0, "reward": 1.4350974559783936, "reward_std": 0.09430722892284393, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.435097336769104, "rewards/correct_reward_func/std": 0.17246632277965546, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 1330.65478515625, "completions/mean_terminated_length": 1330.65478515625, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 0.1277258566978193, "grad_norm": 0.6202122569084167, "kl": 0.02360576204955578, "learning_rate": 1.955625e-06, "loss": -0.0197, "num_tokens": 10591580.0, "reward": 1.4157038927078247, "reward_std": 0.10765408724546432, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4276086390018463, "rewards/correct_reward_func/std": 0.12850892543792725, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 1427.7738037109375, "completions/mean_terminated_length": 1346.277099609375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.1292834890965732, "grad_norm": 0.6136038899421692, "kl": 0.023545796051621437, "learning_rate": 1.955e-06, "loss": 0.0744, "num_tokens": 10717513.0, "reward": 1.4062758684158325, "reward_std": 0.1044679582118988, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4181804656982422, "rewards/correct_reward_func/std": 0.12793242931365967, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1328.46435546875, "completions/mean_terminated_length": 1328.46435546875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 0.1308411214953271, "grad_norm": 0.6154831051826477, "kl": 0.024212509393692017, "learning_rate": 1.954375e-06, "loss": -0.0025, "num_tokens": 10835026.0, "reward": 1.4841961860656738, "reward_std": 0.08680614829063416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48419615626335144, "rewards/correct_reward_func/std": 0.18122969567775726, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1370.1905517578125, "completions/mean_terminated_length": 1370.1905517578125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.13239875389408098, "grad_norm": 0.5834784507751465, "kl": 0.02425501774996519, "learning_rate": 1.95375e-06, "loss": 0.0084, "num_tokens": 10956050.0, "reward": 1.4767500162124634, "reward_std": 0.0894772931933403, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4767499566078186, "rewards/correct_reward_func/std": 0.17486557364463806, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1371.547607421875, "completions/mean_terminated_length": 1371.547607421875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.13395638629283488, "grad_norm": 0.5802989602088928, "kl": 0.024927244521677494, "learning_rate": 1.953125e-06, "loss": 0.015, "num_tokens": 11077188.0, "reward": 1.4462058544158936, "reward_std": 0.07399098575115204, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4462057650089264, "rewards/correct_reward_func/std": 0.12212073057889938, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 1386.9405517578125, "completions/mean_terminated_length": 1386.9405517578125, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.13551401869158877, "grad_norm": 0.5793676376342773, "kl": 0.024698903784155846, "learning_rate": 1.9525e-06, "loss": -0.0095, "num_tokens": 11199721.0, "reward": 1.4632221460342407, "reward_std": 0.11924762278795242, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47512686252593994, "rewards/correct_reward_func/std": 0.20697803795337677, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1388.047607421875, "completions/mean_terminated_length": 1388.047607421875, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 0.13707165109034267, "grad_norm": 0.5643482804298401, "kl": 0.02437182515859604, "learning_rate": 1.951875e-06, "loss": -0.0021, "num_tokens": 11322299.0, "reward": 1.4515498876571655, "reward_std": 0.08298921585083008, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45154979825019836, "rewards/correct_reward_func/std": 0.13497759401798248, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1484.107177734375, "completions/mean_terminated_length": 1403.2890625, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.13862928348909656, "grad_norm": 0.5531883239746094, "kl": 0.023836837150156498, "learning_rate": 1.9512499999999997e-06, "loss": 0.0771, "num_tokens": 11453042.0, "reward": 1.4721571207046509, "reward_std": 0.07365047186613083, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4721570611000061, "rewards/correct_reward_func/std": 0.17172464728355408, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1920.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1399.9881591796875, "completions/mean_terminated_length": 1399.9881591796875, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.14018691588785046, "grad_norm": 0.5798895359039307, "kl": 0.024346785619854927, "learning_rate": 1.950625e-06, "loss": 0.0128, "num_tokens": 11576779.0, "reward": 1.4762535095214844, "reward_std": 0.07557668536901474, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4762535095214844, "rewards/correct_reward_func/std": 0.11668115109205246, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 1411.7261962890625, "completions/mean_terminated_length": 1411.7261962890625, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.14174454828660435, "grad_norm": 0.5864601731300354, "kl": 0.025230017490684986, "learning_rate": 1.95e-06, "loss": -0.0058, "num_tokens": 11701376.0, "reward": 1.43711256980896, "reward_std": 0.06144386902451515, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4371124505996704, "rewards/correct_reward_func/std": 0.11439383029937744, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3613.0, "completions/max_terminated_length": 3613.0, "completions/mean_length": 1334.6190185546875, "completions/mean_terminated_length": 1334.6190185546875, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 0.14330218068535824, "grad_norm": 0.607031524181366, "kl": 0.026643778197467327, "learning_rate": 1.949375e-06, "loss": 0.0048, "num_tokens": 11819346.0, "reward": 1.4904909133911133, "reward_std": 0.0712086409330368, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4904908537864685, "rewards/correct_reward_func/std": 0.11422253400087357, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1322.666748046875, "completions/mean_terminated_length": 1322.666748046875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.14485981308411214, "grad_norm": 0.6401565074920654, "kl": 0.024812299758195877, "learning_rate": 1.9487499999999998e-06, "loss": -0.0045, "num_tokens": 11936330.0, "reward": 1.3906474113464355, "reward_std": 0.07590549439191818, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4025520086288452, "rewards/correct_reward_func/std": 0.16384169459342957, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3187.0, "completions/max_terminated_length": 3187.0, "completions/mean_length": 1393.202392578125, "completions/mean_terminated_length": 1393.202392578125, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.14641744548286603, "grad_norm": 0.6122695803642273, "kl": 0.025386733002960682, "learning_rate": 1.948125e-06, "loss": 0.0086, "num_tokens": 12059407.0, "reward": 1.562843680381775, "reward_std": 0.11002606898546219, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5747482776641846, "rewards/correct_reward_func/std": 0.1576448678970337, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2248.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 1397.547607421875, "completions/mean_terminated_length": 1397.547607421875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.14797507788161993, "grad_norm": 0.620469868183136, "kl": 0.02531202882528305, "learning_rate": 1.9475e-06, "loss": -0.0123, "num_tokens": 12182867.0, "reward": 1.3950475454330444, "reward_std": 0.11134982109069824, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4069521725177765, "rewards/correct_reward_func/std": 0.14087031781673431, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1391.297607421875, "completions/mean_terminated_length": 1391.297607421875, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 0.14953271028037382, "grad_norm": 0.5656567215919495, "kl": 0.025920305401086807, "learning_rate": 1.946875e-06, "loss": 0.0145, "num_tokens": 12305718.0, "reward": 1.447014570236206, "reward_std": 0.08030132949352264, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44701454043388367, "rewards/correct_reward_func/std": 0.14291325211524963, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1303.34521484375, "completions/mean_terminated_length": 1303.34521484375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.15109034267912771, "grad_norm": 0.6328920125961304, "kl": 0.026815838180482388, "learning_rate": 1.94625e-06, "loss": 0.0115, "num_tokens": 12421073.0, "reward": 1.4320292472839355, "reward_std": 0.07025571167469025, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4320293068885803, "rewards/correct_reward_func/std": 0.14837507903575897, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 1374.34521484375, "completions/mean_terminated_length": 1374.34521484375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.1526479750778816, "grad_norm": 0.564152181148529, "kl": 0.025968145579099655, "learning_rate": 1.9456249999999997e-06, "loss": -0.0224, "num_tokens": 12542284.0, "reward": 1.3898595571517944, "reward_std": 0.11127079278230667, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.41366904973983765, "rewards/correct_reward_func/std": 0.18172919750213623, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1444.761962890625, "completions/mean_terminated_length": 1444.761962890625, "completions/min_length": 925.0, "completions/min_terminated_length": 925.0, "epoch": 0.1542056074766355, "grad_norm": 0.5919306874275208, "kl": 0.02604432962834835, "learning_rate": 1.945e-06, "loss": 0.028, "num_tokens": 12669842.0, "reward": 1.4752211570739746, "reward_std": 0.07153313606977463, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47522109746932983, "rewards/correct_reward_func/std": 0.1358029991388321, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1375.2857666015625, "completions/mean_terminated_length": 1375.2857666015625, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.1557632398753894, "grad_norm": 0.6110028624534607, "kl": 0.02663259394466877, "learning_rate": 1.944375e-06, "loss": 0.0359, "num_tokens": 12791516.0, "reward": 1.4065624475479126, "reward_std": 0.07423868775367737, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4065624475479126, "rewards/correct_reward_func/std": 0.1365046501159668, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 1338.75, "completions/mean_terminated_length": 1338.75, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.1573208722741433, "grad_norm": 0.6527758240699768, "kl": 0.026112915948033333, "learning_rate": 1.94375e-06, "loss": 0.0353, "num_tokens": 12909851.0, "reward": 1.4908164739608765, "reward_std": 0.11687764525413513, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5027210712432861, "rewards/correct_reward_func/std": 0.15153582394123077, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1339.357177734375, "completions/mean_terminated_length": 1339.357177734375, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.1588785046728972, "grad_norm": 0.6052369475364685, "kl": 0.026593846268951893, "learning_rate": 1.9431249999999997e-06, "loss": -0.0369, "num_tokens": 13028333.0, "reward": 1.443188190460205, "reward_std": 0.06765951961278915, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4431880712509155, "rewards/correct_reward_func/std": 0.12718868255615234, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1414.84521484375, "completions/mean_terminated_length": 1333.1927490234375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.16043613707165108, "grad_norm": 0.5841978788375854, "kl": 0.027741556987166405, "learning_rate": 1.9424999999999996e-06, "loss": 0.0426, "num_tokens": 13153132.0, "reward": 1.458450198173523, "reward_std": 0.10668490082025528, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4584501385688782, "rewards/correct_reward_func/std": 0.17332585155963898, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1408.5238037109375, "completions/mean_terminated_length": 1408.5238037109375, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.16199376947040497, "grad_norm": 0.574649453163147, "kl": 0.026361594907939434, "learning_rate": 1.941875e-06, "loss": 0.0063, "num_tokens": 13277502.0, "reward": 1.4935458898544312, "reward_std": 0.06738085299730301, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.493545800447464, "rewards/correct_reward_func/std": 0.17171715199947357, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1397.2261962890625, "completions/mean_terminated_length": 1397.2261962890625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.16355140186915887, "grad_norm": 0.6107763648033142, "kl": 0.028758167289197445, "learning_rate": 1.94125e-06, "loss": -0.0053, "num_tokens": 13401001.0, "reward": 1.4977682828903198, "reward_std": 0.07337197661399841, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49776825308799744, "rewards/correct_reward_func/std": 0.16519995033740997, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 1302.9405517578125, "completions/mean_terminated_length": 1302.9405517578125, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 0.16510903426791276, "grad_norm": 0.5974235534667969, "kl": 0.02774975076317787, "learning_rate": 1.940625e-06, "loss": 0.0079, "num_tokens": 13516292.0, "reward": 1.500797152519226, "reward_std": 0.07562069594860077, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5007970333099365, "rewards/correct_reward_func/std": 0.1385519951581955, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1333.3214111328125, "completions/mean_terminated_length": 1333.3214111328125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.16666666666666666, "grad_norm": 0.5899143218994141, "kl": 0.02739392127841711, "learning_rate": 1.94e-06, "loss": -0.0042, "num_tokens": 13634147.0, "reward": 1.3661153316497803, "reward_std": 0.1458723396062851, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.4018295407295227, "rewards/correct_reward_func/std": 0.14165017008781433, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1340.2261962890625, "completions/mean_terminated_length": 1340.2261962890625, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 0.16822429906542055, "grad_norm": 0.6039218902587891, "kl": 0.027482734993100166, "learning_rate": 1.939375e-06, "loss": 0.0001, "num_tokens": 13752750.0, "reward": 1.4691303968429565, "reward_std": 0.07967161387205124, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.469130277633667, "rewards/correct_reward_func/std": 0.17357668280601501, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1375.047607421875, "completions/mean_terminated_length": 1375.047607421875, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.16978193146417445, "grad_norm": 0.575854480266571, "kl": 0.029047698713839054, "learning_rate": 1.93875e-06, "loss": -0.0041, "num_tokens": 13874344.0, "reward": 1.4227412939071655, "reward_std": 0.07607690989971161, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42274120450019836, "rewards/correct_reward_func/std": 0.12703333795070648, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 1464.4285888671875, "completions/mean_terminated_length": 1383.3734130859375, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.17133956386292834, "grad_norm": 0.5789852738380432, "kl": 0.02678022440522909, "learning_rate": 1.938125e-06, "loss": 0.0679, "num_tokens": 14003362.0, "reward": 1.4550813436508179, "reward_std": 0.09208068251609802, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4550813138484955, "rewards/correct_reward_func/std": 0.12678542733192444, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1335.797607421875, "completions/mean_terminated_length": 1335.797607421875, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.17289719626168223, "grad_norm": 0.5807675719261169, "kl": 0.027965486980974674, "learning_rate": 1.9375e-06, "loss": 0.0215, "num_tokens": 14121389.0, "reward": 1.4275166988372803, "reward_std": 0.09834519028663635, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43942132592201233, "rewards/correct_reward_func/std": 0.17403005063533783, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 1329.2738037109375, "completions/mean_terminated_length": 1329.2738037109375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.17445482866043613, "grad_norm": 0.6370654702186584, "kl": 0.03011655993759632, "learning_rate": 1.936875e-06, "loss": -0.0064, "num_tokens": 14239240.0, "reward": 1.4735430479049683, "reward_std": 0.07655790448188782, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4735429883003235, "rewards/correct_reward_func/std": 0.1377311646938324, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1350.702392578125, "completions/mean_terminated_length": 1350.702392578125, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.17601246105919002, "grad_norm": 0.5633478164672852, "kl": 0.027842647396028042, "learning_rate": 1.93625e-06, "loss": 0.0033, "num_tokens": 14358717.0, "reward": 1.4617105722427368, "reward_std": 0.07503892481327057, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46171048283576965, "rewards/correct_reward_func/std": 0.11019705981016159, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1320.6785888671875, "completions/mean_terminated_length": 1320.6785888671875, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.17757009345794392, "grad_norm": 0.622870683670044, "kl": 0.02942818123847246, "learning_rate": 1.9356249999999998e-06, "loss": -0.0117, "num_tokens": 14475876.0, "reward": 1.4551728963851929, "reward_std": 0.07278000563383102, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4551727771759033, "rewards/correct_reward_func/std": 0.12961725890636444, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 1367.547607421875, "completions/mean_terminated_length": 1367.547607421875, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 0.1791277258566978, "grad_norm": 0.5723932385444641, "kl": 0.027969708666205406, "learning_rate": 1.935e-06, "loss": 0.0019, "num_tokens": 14596690.0, "reward": 1.424649715423584, "reward_std": 0.10459105670452118, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4365543723106384, "rewards/correct_reward_func/std": 0.13125936686992645, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 1496.8690185546875, "completions/mean_terminated_length": 1416.2047119140625, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.1806853582554517, "grad_norm": 0.5445118546485901, "kl": 0.027951962314546108, "learning_rate": 1.934375e-06, "loss": 0.0382, "num_tokens": 14728457.0, "reward": 1.4374322891235352, "reward_std": 0.08895470947027206, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4374321401119232, "rewards/correct_reward_func/std": 0.1454283893108368, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1406.547607421875, "completions/mean_terminated_length": 1406.547607421875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.1822429906542056, "grad_norm": 0.5892264246940613, "kl": 0.02943518850952387, "learning_rate": 1.93375e-06, "loss": 0.0251, "num_tokens": 14852511.0, "reward": 1.4491535425186157, "reward_std": 0.07768179476261139, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44915345311164856, "rewards/correct_reward_func/std": 0.14331160485744476, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1454.1429443359375, "completions/mean_terminated_length": 1454.1429443359375, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "epoch": 0.1838006230529595, "grad_norm": 0.5593940615653992, "kl": 0.028669409453868866, "learning_rate": 1.933125e-06, "loss": -0.0092, "num_tokens": 14980683.0, "reward": 1.4328858852386475, "reward_std": 0.06446022540330887, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4328858554363251, "rewards/correct_reward_func/std": 0.15252062678337097, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1976.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1330.416748046875, "completions/mean_terminated_length": 1330.416748046875, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.1853582554517134, "grad_norm": 0.6204725503921509, "kl": 0.031036019325256348, "learning_rate": 1.9325e-06, "loss": 0.0538, "num_tokens": 15098204.0, "reward": 1.4865161180496216, "reward_std": 0.06775263696908951, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4865160584449768, "rewards/correct_reward_func/std": 0.1286322921514511, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 1421.0714111328125, "completions/mean_terminated_length": 1421.0714111328125, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.18691588785046728, "grad_norm": 0.6097022294998169, "kl": 0.028833536431193352, "learning_rate": 1.931875e-06, "loss": -0.0063, "num_tokens": 15223664.0, "reward": 1.4651082754135132, "reward_std": 0.08360718935728073, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4770130515098572, "rewards/correct_reward_func/std": 0.15723736584186554, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1469.916748046875, "completions/mean_terminated_length": 1469.916748046875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.18847352024922118, "grad_norm": 0.5351011157035828, "kl": 0.028757021762430668, "learning_rate": 1.93125e-06, "loss": 0.0368, "num_tokens": 15353281.0, "reward": 1.4508754014968872, "reward_std": 0.06538330763578415, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4508753716945648, "rewards/correct_reward_func/std": 0.14440658688545227, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 1432.6190185546875, "completions/mean_terminated_length": 1432.6190185546875, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.19003115264797507, "grad_norm": 0.5838773846626282, "kl": 0.030190047807991505, "learning_rate": 1.930625e-06, "loss": 0.0004, "num_tokens": 15479627.0, "reward": 1.5679670572280884, "reward_std": 0.08373278379440308, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5679671168327332, "rewards/correct_reward_func/std": 0.17479771375656128, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1383.1905517578125, "completions/mean_terminated_length": 1383.1905517578125, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 0.19158878504672897, "grad_norm": 0.603692889213562, "kl": 0.03224192373454571, "learning_rate": 1.9299999999999997e-06, "loss": 0.0088, "num_tokens": 15601791.0, "reward": 1.4391270875930786, "reward_std": 0.06994114071130753, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4391269087791443, "rewards/correct_reward_func/std": 0.14909610152244568, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 1490.2381591796875, "completions/mean_terminated_length": 1490.2381591796875, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 0.19314641744548286, "grad_norm": 0.5941579937934875, "kl": 0.02839325089007616, "learning_rate": 1.929375e-06, "loss": 0.0225, "num_tokens": 15733049.0, "reward": 1.415550947189331, "reward_std": 0.06161380559206009, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41555076837539673, "rewards/correct_reward_func/std": 0.10922452807426453, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 1322.2857666015625, "completions/mean_terminated_length": 1322.2857666015625, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 0.19470404984423675, "grad_norm": 0.5985201001167297, "kl": 0.029462194070219994, "learning_rate": 1.92875e-06, "loss": -0.0175, "num_tokens": 15849953.0, "reward": 1.4787646532058716, "reward_std": 0.09507114440202713, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47876468300819397, "rewards/correct_reward_func/std": 0.1848842203617096, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 1334.8333740234375, "completions/mean_terminated_length": 1334.8333740234375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.19626168224299065, "grad_norm": 0.6323754191398621, "kl": 0.02924549486488104, "learning_rate": 1.928125e-06, "loss": -0.0014, "num_tokens": 15967953.0, "reward": 1.520105242729187, "reward_std": 0.07988641411066055, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5201051831245422, "rewards/correct_reward_func/std": 0.16170603036880493, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 1414.9881591796875, "completions/mean_terminated_length": 1414.9881591796875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 0.19781931464174454, "grad_norm": 0.5964637994766235, "kl": 0.029463034123182297, "learning_rate": 1.9274999999999998e-06, "loss": 0.0118, "num_tokens": 16092890.0, "reward": 1.4794553518295288, "reward_std": 0.06303998827934265, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47945523262023926, "rewards/correct_reward_func/std": 0.11690139025449753, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 1490.8214111328125, "completions/mean_terminated_length": 1410.084228515625, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.19937694704049844, "grad_norm": 0.5852746963500977, "kl": 0.029941502027213573, "learning_rate": 1.9268749999999997e-06, "loss": 0.0568, "num_tokens": 16224011.0, "reward": 1.4570320844650269, "reward_std": 0.13305586576461792, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4808415472507477, "rewards/correct_reward_func/std": 0.15189340710639954, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 1419.9285888671875, "completions/mean_terminated_length": 1419.9285888671875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.20093457943925233, "grad_norm": 0.6678198575973511, "kl": 0.02869417704641819, "learning_rate": 1.92625e-06, "loss": -0.0245, "num_tokens": 16349273.0, "reward": 1.4819056987762451, "reward_std": 0.0900546982884407, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4819056987762451, "rewards/correct_reward_func/std": 0.13743773102760315, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 1410.511962890625, "completions/mean_terminated_length": 1410.511962890625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.20249221183800623, "grad_norm": 0.5829043984413147, "kl": 0.029658552259206772, "learning_rate": 1.925625e-06, "loss": 0.0304, "num_tokens": 16473606.0, "reward": 1.4438934326171875, "reward_std": 0.08044224977493286, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4438934326171875, "rewards/correct_reward_func/std": 0.15281730890274048, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1419.6190185546875, "completions/mean_terminated_length": 1419.6190185546875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.20404984423676012, "grad_norm": 0.6060424447059631, "kl": 0.029810849577188492, "learning_rate": 1.9249999999999998e-06, "loss": 0.0186, "num_tokens": 16598698.0, "reward": 1.4231759309768677, "reward_std": 0.13214969635009766, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.4588901996612549, "rewards/correct_reward_func/std": 0.17215143144130707, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1587.047607421875, "completions/mean_terminated_length": 1425.951171875, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.205607476635514, "grad_norm": 0.5848340392112732, "kl": 0.02779593039304018, "learning_rate": 1.9243749999999997e-06, "loss": 0.1231, "num_tokens": 16738082.0, "reward": 1.418549656867981, "reward_std": 0.09781080484390259, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41854962706565857, "rewards/correct_reward_func/std": 0.15713582932949066, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 1363.107177734375, "completions/mean_terminated_length": 1363.107177734375, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.2071651090342679, "grad_norm": 0.6095062494277954, "kl": 0.02982013951987028, "learning_rate": 1.92375e-06, "loss": -0.0032, "num_tokens": 16858487.0, "reward": 1.4797476530075073, "reward_std": 0.08453521132469177, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47974759340286255, "rewards/correct_reward_func/std": 0.13973869383335114, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 1384.3095703125, "completions/mean_terminated_length": 1384.3095703125, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.2087227414330218, "grad_norm": 0.5872117280960083, "kl": 0.03033105470240116, "learning_rate": 1.923125e-06, "loss": -0.0099, "num_tokens": 16980787.0, "reward": 1.4834345579147339, "reward_std": 0.0603872612118721, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48343440890312195, "rewards/correct_reward_func/std": 0.16034552454948425, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 1370.40478515625, "completions/mean_terminated_length": 1370.40478515625, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.2102803738317757, "grad_norm": 0.6103830337524414, "kl": 0.030320947989821434, "learning_rate": 1.9225e-06, "loss": 0.0061, "num_tokens": 17101811.0, "reward": 1.4273531436920166, "reward_std": 0.09265647828578949, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4392578601837158, "rewards/correct_reward_func/std": 0.13740864396095276, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1434.46435546875, "completions/mean_terminated_length": 1434.46435546875, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 0.2118380062305296, "grad_norm": 0.603561520576477, "kl": 0.02980469260364771, "learning_rate": 1.9218749999999997e-06, "loss": -0.0064, "num_tokens": 17228618.0, "reward": 1.4576891660690308, "reward_std": 0.05491868779063225, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45768895745277405, "rewards/correct_reward_func/std": 0.14544495940208435, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 1493.416748046875, "completions/mean_terminated_length": 1493.416748046875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.21339563862928349, "grad_norm": 0.5915752053260803, "kl": 0.028933603316545486, "learning_rate": 1.9212499999999996e-06, "loss": -0.0056, "num_tokens": 17360071.0, "reward": 1.5237936973571777, "reward_std": 0.07290388643741608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5237936973571777, "rewards/correct_reward_func/std": 0.14304442703723907, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1456.1785888671875, "completions/mean_terminated_length": 1456.1785888671875, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.21495327102803738, "grad_norm": 0.5867950916290283, "kl": 0.030623883940279484, "learning_rate": 1.920625e-06, "loss": 0.0369, "num_tokens": 17488402.0, "reward": 1.424317717552185, "reward_std": 0.09510175883769989, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4362224042415619, "rewards/correct_reward_func/std": 0.15234197676181793, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7913.0, "completions/max_terminated_length": 7913.0, "completions/mean_length": 1533.6190185546875, "completions/mean_terminated_length": 1533.6190185546875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.21651090342679127, "grad_norm": 0.5414514541625977, "kl": 0.02811363060027361, "learning_rate": 1.92e-06, "loss": 0.0618, "num_tokens": 17623412.0, "reward": 1.491133689880371, "reward_std": 0.06729433685541153, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49113360047340393, "rewards/correct_reward_func/std": 0.17636097967624664, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1416.011962890625, "completions/mean_terminated_length": 1416.011962890625, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.21806853582554517, "grad_norm": 0.5571795701980591, "kl": 0.03138226270675659, "learning_rate": 1.919375e-06, "loss": 0.0457, "num_tokens": 17748435.0, "reward": 1.4378496408462524, "reward_std": 0.07887466251850128, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4378494918346405, "rewards/correct_reward_func/std": 0.10594429075717926, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1325.5833740234375, "completions/mean_terminated_length": 1325.5833740234375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.21962616822429906, "grad_norm": 0.5585833191871643, "kl": 0.03183058649301529, "learning_rate": 1.91875e-06, "loss": -0.0038, "num_tokens": 17865712.0, "reward": 1.5206776857376099, "reward_std": 0.089789979159832, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5206776261329651, "rewards/correct_reward_func/std": 0.18472737073898315, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1388.392822265625, "completions/mean_terminated_length": 1388.392822265625, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 0.22118380062305296, "grad_norm": 0.5522568225860596, "kl": 0.030762070789933205, "learning_rate": 1.918125e-06, "loss": 0.0044, "num_tokens": 17988241.0, "reward": 1.4582120180130005, "reward_std": 0.13537071645259857, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.49392637610435486, "rewards/correct_reward_func/std": 0.1673257201910019, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 1382.2857666015625, "completions/mean_terminated_length": 1382.2857666015625, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.22274143302180685, "grad_norm": 0.6352323889732361, "kl": 0.03129299636930227, "learning_rate": 1.9175e-06, "loss": 0.0264, "num_tokens": 18110221.0, "reward": 1.4690262079238892, "reward_std": 0.0818016454577446, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.469026118516922, "rewards/correct_reward_func/std": 0.152619868516922, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1394.166748046875, "completions/mean_terminated_length": 1394.166748046875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.22429906542056074, "grad_norm": 0.63814777135849, "kl": 0.03126653470098972, "learning_rate": 1.916875e-06, "loss": 0.0072, "num_tokens": 18233295.0, "reward": 1.5044426918029785, "reward_std": 0.10564389079809189, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5163474082946777, "rewards/correct_reward_func/std": 0.15176692605018616, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1409.261962890625, "completions/mean_terminated_length": 1409.261962890625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.22585669781931464, "grad_norm": 0.6800863146781921, "kl": 0.03138407226651907, "learning_rate": 1.91625e-06, "loss": 0.0101, "num_tokens": 18357601.0, "reward": 1.3754723072052002, "reward_std": 0.12025143206119537, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.3992818295955658, "rewards/correct_reward_func/std": 0.1352054476737976, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1371.5833740234375, "completions/mean_terminated_length": 1371.5833740234375, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.22741433021806853, "grad_norm": 0.5346211791038513, "kl": 0.03179503232240677, "learning_rate": 1.915625e-06, "loss": 0.0068, "num_tokens": 18478958.0, "reward": 1.4643080234527588, "reward_std": 0.05763059854507446, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46430787444114685, "rewards/correct_reward_func/std": 0.13611404597759247, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1347.75, "completions/mean_terminated_length": 1347.75, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.22897196261682243, "grad_norm": 0.6036585569381714, "kl": 0.032050169073045254, "learning_rate": 1.915e-06, "loss": 0.0138, "num_tokens": 18598259.0, "reward": 1.5019899606704712, "reward_std": 0.07086333632469177, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5019899010658264, "rewards/correct_reward_func/std": 0.12254533916711807, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1349.4761962890625, "completions/mean_terminated_length": 1349.4761962890625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.23052959501557632, "grad_norm": 0.5723428130149841, "kl": 0.033308178186416626, "learning_rate": 1.9143749999999998e-06, "loss": -0.0342, "num_tokens": 18717507.0, "reward": 1.439958095550537, "reward_std": 0.09097757190465927, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45186278223991394, "rewards/correct_reward_func/std": 0.15231125056743622, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 1306.0714111328125, "completions/mean_terminated_length": 1306.0714111328125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.23208722741433022, "grad_norm": 0.6826386451721191, "kl": 0.040258824825286865, "learning_rate": 1.91375e-06, "loss": -0.0055, "num_tokens": 18833091.0, "reward": 1.432396411895752, "reward_std": 0.0788629949092865, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4323963522911072, "rewards/correct_reward_func/std": 0.1607416421175003, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 1339.7857666015625, "completions/mean_terminated_length": 1339.7857666015625, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.2336448598130841, "grad_norm": 0.6238492131233215, "kl": 0.033289359882473946, "learning_rate": 1.913125e-06, "loss": -0.0105, "num_tokens": 18951591.0, "reward": 1.4412423372268677, "reward_std": 0.06025463342666626, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44124215841293335, "rewards/correct_reward_func/std": 0.1178692877292633, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1272.0357666015625, "completions/mean_terminated_length": 1272.0357666015625, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 0.235202492211838, "grad_norm": 0.6190460920333862, "kl": 0.034545375034213066, "learning_rate": 1.9125e-06, "loss": -0.0009, "num_tokens": 19064406.0, "reward": 1.402614951133728, "reward_std": 0.054387416690588, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40261486172676086, "rewards/correct_reward_func/std": 0.13192394375801086, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6207.0, "completions/max_terminated_length": 6207.0, "completions/mean_length": 1383.2738037109375, "completions/mean_terminated_length": 1383.2738037109375, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.2367601246105919, "grad_norm": 0.6016775369644165, "kl": 0.03410913795232773, "learning_rate": 1.911875e-06, "loss": -0.0575, "num_tokens": 19186433.0, "reward": 1.465145468711853, "reward_std": 0.06573602557182312, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46514537930488586, "rewards/correct_reward_func/std": 0.13837113976478577, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1315.6309814453125, "completions/mean_terminated_length": 1315.6309814453125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.2383177570093458, "grad_norm": 0.6019495725631714, "kl": 0.03409886732697487, "learning_rate": 1.9112499999999997e-06, "loss": -0.0083, "num_tokens": 19302892.0, "reward": 1.4242923259735107, "reward_std": 0.0667426809668541, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4242922365665436, "rewards/correct_reward_func/std": 0.13553784787654877, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 1342.0833740234375, "completions/mean_terminated_length": 1342.0833740234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2398753894080997, "grad_norm": 0.5771262049674988, "kl": 0.0346537921577692, "learning_rate": 1.910625e-06, "loss": -0.0074, "num_tokens": 19421651.0, "reward": 1.4762444496154785, "reward_std": 0.08208738267421722, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4762443006038666, "rewards/correct_reward_func/std": 0.19542856514453888, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1312.6905517578125, "completions/mean_terminated_length": 1312.6905517578125, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.24143302180685358, "grad_norm": 0.5773271918296814, "kl": 0.03536880388855934, "learning_rate": 1.91e-06, "loss": -0.0149, "num_tokens": 19537893.0, "reward": 1.4299932718276978, "reward_std": 0.06326950341463089, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.429993212223053, "rewards/correct_reward_func/std": 0.15807007253170013, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1352.166748046875, "completions/mean_terminated_length": 1352.166748046875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.24299065420560748, "grad_norm": 0.6289856433868408, "kl": 0.03449527733027935, "learning_rate": 1.909375e-06, "loss": -0.003, "num_tokens": 19657355.0, "reward": 1.4612387418746948, "reward_std": 0.06969407945871353, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46123871207237244, "rewards/correct_reward_func/std": 0.1171262189745903, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1367.142822265625, "completions/mean_terminated_length": 1367.142822265625, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.24454828660436137, "grad_norm": 0.5991291999816895, "kl": 0.034693608060479164, "learning_rate": 1.9087499999999997e-06, "loss": 0.0271, "num_tokens": 19778057.0, "reward": 1.3796496391296387, "reward_std": 0.056341852992773056, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3796495795249939, "rewards/correct_reward_func/std": 0.11140848696231842, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 1404.4881591796875, "completions/mean_terminated_length": 1404.4881591796875, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.24610591900311526, "grad_norm": 0.5495375990867615, "kl": 0.03542102687060833, "learning_rate": 1.908125e-06, "loss": 0.0074, "num_tokens": 19902070.0, "reward": 1.47572660446167, "reward_std": 0.10572995990514755, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.49953609704971313, "rewards/correct_reward_func/std": 0.17438319325447083, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1494.416748046875, "completions/mean_terminated_length": 1413.7227783203125, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.24766355140186916, "grad_norm": 0.5821869373321533, "kl": 0.03435787186026573, "learning_rate": 1.9075e-06, "loss": 0.0632, "num_tokens": 20033679.0, "reward": 1.4355047941207886, "reward_std": 0.0911315381526947, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44740939140319824, "rewards/correct_reward_func/std": 0.15956489741802216, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 1374.7857666015625, "completions/mean_terminated_length": 1374.7857666015625, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 0.24922118380062305, "grad_norm": 0.6089724898338318, "kl": 0.033023279160261154, "learning_rate": 1.906875e-06, "loss": 0.0046, "num_tokens": 20155203.0, "reward": 1.4912819862365723, "reward_std": 0.05513819307088852, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4912818670272827, "rewards/correct_reward_func/std": 0.16794303059577942, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 1431.2381591796875, "completions/mean_terminated_length": 1431.2381591796875, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 0.2507788161993769, "grad_norm": 0.5501843690872192, "kl": 0.03268396854400635, "learning_rate": 1.90625e-06, "loss": 0.0046, "num_tokens": 20281463.0, "reward": 1.4660686254501343, "reward_std": 0.0724034234881401, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4660685062408447, "rewards/correct_reward_func/std": 0.16601873934268951, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 1548.9881591796875, "completions/mean_terminated_length": 1468.9517822265625, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 0.2523364485981308, "grad_norm": 0.5610131621360779, "kl": 0.03170663956552744, "learning_rate": 1.9056249999999999e-06, "loss": 0.0445, "num_tokens": 20417668.0, "reward": 1.451736569404602, "reward_std": 0.09086348861455917, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4517364501953125, "rewards/correct_reward_func/std": 0.17963163554668427, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1610.5, "completions/mean_terminated_length": 1449.9755859375, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.2538940809968847, "grad_norm": 0.5172660946846008, "kl": 0.030543990433216095, "learning_rate": 1.905e-06, "loss": 0.1107, "num_tokens": 20558926.0, "reward": 1.4814132452011108, "reward_std": 0.09557101875543594, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4814131557941437, "rewards/correct_reward_func/std": 0.1708441823720932, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1437.416748046875, "completions/mean_terminated_length": 1437.416748046875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.2554517133956386, "grad_norm": 0.5784984230995178, "kl": 0.03393215127289295, "learning_rate": 1.9043749999999999e-06, "loss": 0.0018, "num_tokens": 20685663.0, "reward": 1.5089834928512573, "reward_std": 0.08463006466627121, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5208882093429565, "rewards/correct_reward_func/std": 0.1223825141787529, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3265.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 1466.6190185546875, "completions/mean_terminated_length": 1466.6190185546875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.2570093457943925, "grad_norm": 0.6114115715026855, "kl": 0.03406968712806702, "learning_rate": 1.90375e-06, "loss": 0.0058, "num_tokens": 20814859.0, "reward": 1.4296975135803223, "reward_std": 0.08567629754543304, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4296974539756775, "rewards/correct_reward_func/std": 0.12944677472114563, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6028.0, "completions/max_terminated_length": 6028.0, "completions/mean_length": 1430.15478515625, "completions/mean_terminated_length": 1430.15478515625, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.2585669781931464, "grad_norm": 0.6096552014350891, "kl": 0.03403060883283615, "learning_rate": 1.9031249999999999e-06, "loss": 0.0408, "num_tokens": 20940830.0, "reward": 1.4890365600585938, "reward_std": 0.05935479328036308, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4890367090702057, "rewards/correct_reward_func/std": 0.17837880551815033, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1496.1785888671875, "completions/mean_terminated_length": 1415.5059814453125, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.2601246105919003, "grad_norm": 0.5552809238433838, "kl": 0.03266907203942537, "learning_rate": 1.9025e-06, "loss": 0.0631, "num_tokens": 21072461.0, "reward": 1.4831464290618896, "reward_std": 0.07709907740354538, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48314639925956726, "rewards/correct_reward_func/std": 0.1887180060148239, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1424.15478515625, "completions/mean_terminated_length": 1424.15478515625, "completions/min_length": 1038.0, "completions/min_terminated_length": 1038.0, "epoch": 0.2616822429906542, "grad_norm": 0.5698901414871216, "kl": 0.03557535447180271, "learning_rate": 1.901875e-06, "loss": 0.0045, "num_tokens": 21198000.0, "reward": 1.4246412515640259, "reward_std": 0.07475357502698898, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4246411919593811, "rewards/correct_reward_func/std": 0.12371546775102615, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3198.0, "completions/max_terminated_length": 3198.0, "completions/mean_length": 1557.857177734375, "completions/mean_terminated_length": 1557.857177734375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.2632398753894081, "grad_norm": 0.5663595795631409, "kl": 0.03528137691318989, "learning_rate": 1.90125e-06, "loss": 0.0162, "num_tokens": 21334890.0, "reward": 1.4614077806472778, "reward_std": 0.06879691779613495, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46140775084495544, "rewards/correct_reward_func/std": 0.12173257023096085, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1575.3095703125, "completions/mean_terminated_length": 1495.59033203125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.26479750778816197, "grad_norm": 0.5297430753707886, "kl": 0.03137396089732647, "learning_rate": 1.900625e-06, "loss": 0.0552, "num_tokens": 21473114.0, "reward": 1.4736964702606201, "reward_std": 0.08518790453672409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47369638085365295, "rewards/correct_reward_func/std": 0.15848514437675476, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2487.0, "completions/max_terminated_length": 2487.0, "completions/mean_length": 1467.511962890625, "completions/mean_terminated_length": 1467.511962890625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 0.26635514018691586, "grad_norm": 0.5714218020439148, "kl": 0.03465087711811066, "learning_rate": 1.8999999999999998e-06, "loss": -0.0139, "num_tokens": 21602295.0, "reward": 1.4387915134429932, "reward_std": 0.07327855378389359, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.438791424036026, "rewards/correct_reward_func/std": 0.14280220866203308, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 1464.6190185546875, "completions/mean_terminated_length": 1464.6190185546875, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.26791277258566976, "grad_norm": 0.5421945452690125, "kl": 0.034480318427085876, "learning_rate": 1.899375e-06, "loss": 0.0296, "num_tokens": 21731377.0, "reward": 1.4377334117889404, "reward_std": 0.0828586295247078, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4496382176876068, "rewards/correct_reward_func/std": 0.13945025205612183, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 1374.34521484375, "completions/mean_terminated_length": 1374.34521484375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.26947040498442365, "grad_norm": 0.6028652191162109, "kl": 0.03403773531317711, "learning_rate": 1.8987499999999998e-06, "loss": -0.0284, "num_tokens": 21852672.0, "reward": 1.4843207597732544, "reward_std": 0.0630205050110817, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4843207001686096, "rewards/correct_reward_func/std": 0.14204466342926025, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 1529.8809814453125, "completions/mean_terminated_length": 1529.8809814453125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.27102803738317754, "grad_norm": 0.5906941890716553, "kl": 0.034525854513049126, "learning_rate": 1.898125e-06, "loss": 0.0167, "num_tokens": 21987254.0, "reward": 1.3962242603302002, "reward_std": 0.0881531834602356, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4081288278102875, "rewards/correct_reward_func/std": 0.08537304401397705, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3739.0, "completions/max_terminated_length": 3739.0, "completions/mean_length": 1501.166748046875, "completions/mean_terminated_length": 1501.166748046875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.27258566978193144, "grad_norm": 0.5807496905326843, "kl": 0.03358875773847103, "learning_rate": 1.8974999999999998e-06, "loss": 0.0106, "num_tokens": 22119274.0, "reward": 1.4818309545516968, "reward_std": 0.08105891197919846, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4818309247493744, "rewards/correct_reward_func/std": 0.1239246129989624, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1430.09521484375, "completions/mean_terminated_length": 1430.09521484375, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.27414330218068533, "grad_norm": 0.5961090922355652, "kl": 0.03483774699270725, "learning_rate": 1.896875e-06, "loss": -0.0327, "num_tokens": 22245432.0, "reward": 1.4600034952163696, "reward_std": 0.08812181651592255, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4719081223011017, "rewards/correct_reward_func/std": 0.1560893952846527, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1490.65478515625, "completions/mean_terminated_length": 1409.9156494140625, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.2757009345794392, "grad_norm": 0.5829291343688965, "kl": 0.033502571284770966, "learning_rate": 1.8962499999999998e-06, "loss": 0.0753, "num_tokens": 22376581.0, "reward": 1.4762822389602661, "reward_std": 0.09881778061389923, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4762822091579437, "rewards/correct_reward_func/std": 0.15524689853191376, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 1661.4405517578125, "completions/mean_terminated_length": 1502.158447265625, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 0.2772585669781931, "grad_norm": 0.49473848938941956, "kl": 0.031944600865244865, "learning_rate": 1.8956249999999997e-06, "loss": 0.1143, "num_tokens": 22522250.0, "reward": 1.4583110809326172, "reward_std": 0.07724699378013611, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4583110809326172, "rewards/correct_reward_func/std": 0.15963733196258545, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 1503.09521484375, "completions/mean_terminated_length": 1503.09521484375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.278816199376947, "grad_norm": 0.5652268528938293, "kl": 0.033452507108449936, "learning_rate": 1.8949999999999999e-06, "loss": -0.0117, "num_tokens": 22654690.0, "reward": 1.539380431175232, "reward_std": 0.07203835994005203, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5393803119659424, "rewards/correct_reward_func/std": 0.15730725228786469, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1424.7381591796875, "completions/mean_terminated_length": 1424.7381591796875, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 0.2803738317757009, "grad_norm": 0.6266071796417236, "kl": 0.03748060762882233, "learning_rate": 1.8943749999999998e-06, "loss": 0.0132, "num_tokens": 22780308.0, "reward": 1.4268115758895874, "reward_std": 0.06137494370341301, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4268115758895874, "rewards/correct_reward_func/std": 0.1579686850309372, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 1513.3214111328125, "completions/mean_terminated_length": 1432.8553466796875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.2819314641744548, "grad_norm": 0.5787291526794434, "kl": 0.03337083198130131, "learning_rate": 1.8937499999999999e-06, "loss": 0.0331, "num_tokens": 22913361.0, "reward": 1.4345508813858032, "reward_std": 0.08300718665122986, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43455079197883606, "rewards/correct_reward_func/std": 0.12243600934743881, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 1513.0238037109375, "completions/mean_terminated_length": 1513.0238037109375, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.2834890965732087, "grad_norm": 0.5958402752876282, "kl": 0.0339033342897892, "learning_rate": 1.8931249999999998e-06, "loss": -0.0335, "num_tokens": 23046509.0, "reward": 1.4896963834762573, "reward_std": 0.074093759059906, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4896962344646454, "rewards/correct_reward_func/std": 0.18158133327960968, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 1505.4405517578125, "completions/mean_terminated_length": 1505.4405517578125, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 0.2850467289719626, "grad_norm": 0.5892803072929382, "kl": 0.03511413745582104, "learning_rate": 1.8924999999999999e-06, "loss": -0.0281, "num_tokens": 23179050.0, "reward": 1.4773499965667725, "reward_std": 0.06594084203243256, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4773499667644501, "rewards/correct_reward_func/std": 0.12919507920742035, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1522.84521484375, "completions/mean_terminated_length": 1442.493896484375, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.2866043613707165, "grad_norm": 0.5434836149215698, "kl": 0.034950753673911095, "learning_rate": 1.891875e-06, "loss": 0.0603, "num_tokens": 23312909.0, "reward": 1.4319921731948853, "reward_std": 0.09058649092912674, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4438968300819397, "rewards/correct_reward_func/std": 0.1126542016863823, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1422.1190185546875, "completions/mean_terminated_length": 1422.1190185546875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.2881619937694704, "grad_norm": 0.6052145957946777, "kl": 0.03563849255442619, "learning_rate": 1.89125e-06, "loss": -0.0224, "num_tokens": 23438439.0, "reward": 1.508691668510437, "reward_std": 0.06740865856409073, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5086915493011475, "rewards/correct_reward_func/std": 0.1557309627532959, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1442.6905517578125, "completions/mean_terminated_length": 1442.6905517578125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.2897196261682243, "grad_norm": 0.5826404094696045, "kl": 0.03599457070231438, "learning_rate": 1.890625e-06, "loss": 0.022, "num_tokens": 23565397.0, "reward": 1.44635009765625, "reward_std": 0.06368335336446762, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44635000824928284, "rewards/correct_reward_func/std": 0.14564184844493866, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1438.2381591796875, "completions/mean_terminated_length": 1438.2381591796875, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.29127725856697817, "grad_norm": 0.597940981388092, "kl": 0.0359827596694231, "learning_rate": 1.89e-06, "loss": -0.0061, "num_tokens": 23691981.0, "reward": 1.4225126504898071, "reward_std": 0.07259950041770935, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4225126802921295, "rewards/correct_reward_func/std": 0.14631718397140503, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 1509.1785888671875, "completions/mean_terminated_length": 1509.1785888671875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.29283489096573206, "grad_norm": 0.5778841972351074, "kl": 0.03521919064223766, "learning_rate": 1.889375e-06, "loss": -0.0273, "num_tokens": 23824830.0, "reward": 1.4619708061218262, "reward_std": 0.07388392835855484, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46197065711021423, "rewards/correct_reward_func/std": 0.13990521430969238, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1537.8333740234375, "completions/mean_terminated_length": 1457.66259765625, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 0.29439252336448596, "grad_norm": 0.5850031971931458, "kl": 0.036018045619130135, "learning_rate": 1.88875e-06, "loss": 0.078, "num_tokens": 23959864.0, "reward": 1.4022135734558105, "reward_std": 0.06926076114177704, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.402213454246521, "rewards/correct_reward_func/std": 0.15882909297943115, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1444.0833740234375, "completions/mean_terminated_length": 1444.0833740234375, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.29595015576323985, "grad_norm": 0.6393625736236572, "kl": 0.03580236993730068, "learning_rate": 1.888125e-06, "loss": -0.0305, "num_tokens": 24087125.0, "reward": 1.4801563024520874, "reward_std": 0.0749017521739006, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4801561236381531, "rewards/correct_reward_func/std": 0.1525711715221405, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 1454.8929443359375, "completions/mean_terminated_length": 1454.8929443359375, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 0.29750778816199375, "grad_norm": 0.6086553931236267, "kl": 0.036950401961803436, "learning_rate": 1.8875e-06, "loss": -0.0006, "num_tokens": 24215414.0, "reward": 1.4507030248641968, "reward_std": 0.07029537856578827, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4507029950618744, "rewards/correct_reward_func/std": 0.1585504561662674, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 1376.3690185546875, "completions/mean_terminated_length": 1376.3690185546875, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.29906542056074764, "grad_norm": 0.6285943984985352, "kl": 0.03829081356525421, "learning_rate": 1.886875e-06, "loss": 0.015, "num_tokens": 24336927.0, "reward": 1.422336459159851, "reward_std": 0.06646425276994705, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42233631014823914, "rewards/correct_reward_func/std": 0.12761962413787842, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 1498.59521484375, "completions/mean_terminated_length": 1417.9517822265625, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.30062305295950154, "grad_norm": 0.5861801505088806, "kl": 0.036135466769337654, "learning_rate": 1.88625e-06, "loss": 0.0616, "num_tokens": 24468491.0, "reward": 1.4807928800582886, "reward_std": 0.07590245455503464, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.480792760848999, "rewards/correct_reward_func/std": 0.18916194140911102, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1527.6785888671875, "completions/mean_terminated_length": 1527.6785888671875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.30218068535825543, "grad_norm": 0.5360192656517029, "kl": 0.03715986758470535, "learning_rate": 1.885625e-06, "loss": 0.0182, "num_tokens": 24602846.0, "reward": 1.4494075775146484, "reward_std": 0.06413312256336212, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44940757751464844, "rewards/correct_reward_func/std": 0.14744633436203003, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1480.047607421875, "completions/mean_terminated_length": 1480.047607421875, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 0.3037383177570093, "grad_norm": 0.6489185094833374, "kl": 0.037878649309277534, "learning_rate": 1.885e-06, "loss": -0.0068, "num_tokens": 24733212.0, "reward": 1.4438209533691406, "reward_std": 0.05984492227435112, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44382089376449585, "rewards/correct_reward_func/std": 0.11844155192375183, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 1502.5714111328125, "completions/mean_terminated_length": 1502.5714111328125, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.3052959501557632, "grad_norm": 0.6016018390655518, "kl": 0.037760429084300995, "learning_rate": 1.8843749999999999e-06, "loss": -0.0091, "num_tokens": 24865398.0, "reward": 1.4562833309173584, "reward_std": 0.0823647603392601, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46818795800209045, "rewards/correct_reward_func/std": 0.15202713012695312, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 1469.3214111328125, "completions/mean_terminated_length": 1469.3214111328125, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.3068535825545171, "grad_norm": 0.5995698571205139, "kl": 0.03654175065457821, "learning_rate": 1.88375e-06, "loss": 0.027, "num_tokens": 24995049.0, "reward": 1.504838466644287, "reward_std": 0.07503201067447662, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5048382878303528, "rewards/correct_reward_func/std": 0.14492768049240112, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1468.3214111328125, "completions/mean_terminated_length": 1468.3214111328125, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.308411214953271, "grad_norm": 0.5627298951148987, "kl": 0.03755324147641659, "learning_rate": 1.8831249999999999e-06, "loss": 0.0006, "num_tokens": 25124256.0, "reward": 1.4946554899215698, "reward_std": 0.07795637100934982, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4946552515029907, "rewards/correct_reward_func/std": 0.18591701984405518, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4446.0, "completions/max_terminated_length": 4446.0, "completions/mean_length": 1489.4761962890625, "completions/mean_terminated_length": 1489.4761962890625, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.3099688473520249, "grad_norm": 0.5919142365455627, "kl": 0.03832128271460533, "learning_rate": 1.8825e-06, "loss": 0.0226, "num_tokens": 25255240.0, "reward": 1.4337986707687378, "reward_std": 0.13976813852787018, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.457608163356781, "rewards/correct_reward_func/std": 0.16218747198581696, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1443.702392578125, "completions/mean_terminated_length": 1443.702392578125, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 0.3115264797507788, "grad_norm": 0.6555087566375732, "kl": 0.0495732706040144, "learning_rate": 1.8818749999999999e-06, "loss": 0.0299, "num_tokens": 25382517.0, "reward": 1.4128016233444214, "reward_std": 0.0843491479754448, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4247063994407654, "rewards/correct_reward_func/std": 0.15398363769054413, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 1525.0714111328125, "completions/mean_terminated_length": 1525.0714111328125, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.3130841121495327, "grad_norm": 0.6016954779624939, "kl": 0.03820333816111088, "learning_rate": 1.88125e-06, "loss": 0.0094, "num_tokens": 25516815.0, "reward": 1.4183402061462402, "reward_std": 0.08215481042861938, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4421497583389282, "rewards/correct_reward_func/std": 0.12978559732437134, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 1495.3095703125, "completions/mean_terminated_length": 1495.3095703125, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.3146417445482866, "grad_norm": 0.6088923811912537, "kl": 0.039613427594304085, "learning_rate": 1.8806249999999999e-06, "loss": 0.0236, "num_tokens": 25648445.0, "reward": 1.4182132482528687, "reward_std": 0.0829334408044815, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4301179349422455, "rewards/correct_reward_func/std": 0.14303255081176758, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1409.511962890625, "completions/mean_terminated_length": 1409.511962890625, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 0.3161993769470405, "grad_norm": 0.6218343377113342, "kl": 0.03885827772319317, "learning_rate": 1.8799999999999998e-06, "loss": -0.0285, "num_tokens": 25772772.0, "reward": 1.5313540697097778, "reward_std": 0.06517814844846725, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5313540697097778, "rewards/correct_reward_func/std": 0.16049352288246155, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 1644.90478515625, "completions/mean_terminated_length": 1485.219482421875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.3177570093457944, "grad_norm": 0.608787477016449, "kl": 0.037238216027617455, "learning_rate": 1.879375e-06, "loss": 0.1147, "num_tokens": 25916914.0, "reward": 1.4204800128936768, "reward_std": 0.1299707591533661, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4442894458770752, "rewards/correct_reward_func/std": 0.15315347909927368, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1391.2857666015625, "completions/mean_terminated_length": 1391.2857666015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.31931464174454827, "grad_norm": 0.6118940114974976, "kl": 0.03947138041257858, "learning_rate": 1.8787499999999998e-06, "loss": -0.0174, "num_tokens": 26039794.0, "reward": 1.4847838878631592, "reward_std": 0.08492320775985718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48478370904922485, "rewards/correct_reward_func/std": 0.173141211271286, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1503.46435546875, "completions/mean_terminated_length": 1503.46435546875, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "epoch": 0.32087227414330216, "grad_norm": 0.5726230144500732, "kl": 0.03891279548406601, "learning_rate": 1.878125e-06, "loss": 0.0236, "num_tokens": 26172265.0, "reward": 1.4738762378692627, "reward_std": 0.06679094582796097, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4738762378692627, "rewards/correct_reward_func/std": 0.1379547268152237, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1410.6785888671875, "completions/mean_terminated_length": 1410.6785888671875, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.32242990654205606, "grad_norm": 0.6023487448692322, "kl": 0.04066877439618111, "learning_rate": 1.8774999999999998e-06, "loss": -0.0013, "num_tokens": 26296666.0, "reward": 1.4893161058425903, "reward_std": 0.06180576980113983, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4893161356449127, "rewards/correct_reward_func/std": 0.13071846961975098, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5342.0, "completions/max_terminated_length": 5342.0, "completions/mean_length": 1561.0833740234375, "completions/mean_terminated_length": 1561.0833740234375, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.32398753894080995, "grad_norm": 0.5445609092712402, "kl": 0.03924229182302952, "learning_rate": 1.876875e-06, "loss": 0.0128, "num_tokens": 26433857.0, "reward": 1.4545553922653198, "reward_std": 0.047772545367479324, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4545552432537079, "rewards/correct_reward_func/std": 0.12593813240528107, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1578.8690185546875, "completions/mean_terminated_length": 1499.1927490234375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.32554517133956384, "grad_norm": 0.6344649791717529, "kl": 0.03905314393341541, "learning_rate": 1.8762499999999998e-06, "loss": 0.0585, "num_tokens": 26572404.0, "reward": 1.366521954536438, "reward_std": 0.1604805886745453, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.4022361636161804, "rewards/correct_reward_func/std": 0.13376381993293762, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1402.7857666015625, "completions/mean_terminated_length": 1402.7857666015625, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.32710280373831774, "grad_norm": 0.633216381072998, "kl": 0.04154348373413086, "learning_rate": 1.875625e-06, "loss": -0.0066, "num_tokens": 26696082.0, "reward": 1.5711801052093506, "reward_std": 0.07012538611888885, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5711799263954163, "rewards/correct_reward_func/std": 0.11415733397006989, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1505.1190185546875, "completions/mean_terminated_length": 1505.1190185546875, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.32866043613707163, "grad_norm": 0.5750628113746643, "kl": 0.040491702035069466, "learning_rate": 1.8749999999999998e-06, "loss": 0.0052, "num_tokens": 26828632.0, "reward": 1.4506397247314453, "reward_std": 0.07769308984279633, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46254438161849976, "rewards/correct_reward_func/std": 0.13594752550125122, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 1535.9761962890625, "completions/mean_terminated_length": 1535.9761962890625, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.3302180685358255, "grad_norm": 0.5685467720031738, "kl": 0.04025058262050152, "learning_rate": 1.8743749999999997e-06, "loss": -0.0142, "num_tokens": 26963768.0, "reward": 1.5034323930740356, "reward_std": 0.06803149729967117, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5034322142601013, "rewards/correct_reward_func/std": 0.1639764904975891, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 1490.21435546875, "completions/mean_terminated_length": 1490.21435546875, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 0.3317757009345794, "grad_norm": 0.6390542387962341, "kl": 0.0426274798810482, "learning_rate": 1.8737499999999998e-06, "loss": -0.0129, "num_tokens": 27094820.0, "reward": 1.4857133626937866, "reward_std": 0.06765501946210861, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48571324348449707, "rewards/correct_reward_func/std": 0.1724708527326584, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 1452.416748046875, "completions/mean_terminated_length": 1452.416748046875, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.3333333333333333, "grad_norm": 0.629348635673523, "kl": 0.04252097010612488, "learning_rate": 1.8731249999999997e-06, "loss": -0.0138, "num_tokens": 27222709.0, "reward": 1.4405913352966309, "reward_std": 0.05661017820239067, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44059130549430847, "rewards/correct_reward_func/std": 0.10197056829929352, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1461.34521484375, "completions/mean_terminated_length": 1461.34521484375, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 0.3348909657320872, "grad_norm": 0.5977749824523926, "kl": 0.04298440180718899, "learning_rate": 1.8725e-06, "loss": -0.0026, "num_tokens": 27351420.0, "reward": 1.505555272102356, "reward_std": 0.07077483087778091, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5055552124977112, "rewards/correct_reward_func/std": 0.19038523733615875, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1527.8214111328125, "completions/mean_terminated_length": 1527.8214111328125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.3364485981308411, "grad_norm": 0.5859149694442749, "kl": 0.043821416795253754, "learning_rate": 1.871875e-06, "loss": 0.0212, "num_tokens": 27485739.0, "reward": 1.4727665185928345, "reward_std": 0.13684409856796265, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4965760409832001, "rewards/correct_reward_func/std": 0.15358960628509521, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1495.8929443359375, "completions/mean_terminated_length": 1495.8929443359375, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.338006230529595, "grad_norm": 0.6052335500717163, "kl": 0.04412714019417763, "learning_rate": 1.87125e-06, "loss": 0.0149, "num_tokens": 27617418.0, "reward": 1.4542310237884521, "reward_std": 0.06858990341424942, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.454230934381485, "rewards/correct_reward_func/std": 0.16202780604362488, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1571.96435546875, "completions/mean_terminated_length": 1571.96435546875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.3395638629283489, "grad_norm": 0.5771605968475342, "kl": 0.043260419741272926, "learning_rate": 1.870625e-06, "loss": -0.0301, "num_tokens": 27755499.0, "reward": 1.4477570056915283, "reward_std": 0.08448237925767899, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44775694608688354, "rewards/correct_reward_func/std": 0.15343379974365234, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1507.511962890625, "completions/mean_terminated_length": 1507.511962890625, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.3411214953271028, "grad_norm": 0.5883904099464417, "kl": 0.044709596782922745, "learning_rate": 1.87e-06, "loss": -0.0025, "num_tokens": 27888064.0, "reward": 1.4478862285614014, "reward_std": 0.07146090269088745, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.447886198759079, "rewards/correct_reward_func/std": 0.12358597666025162, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1559.8214111328125, "completions/mean_terminated_length": 1559.8214111328125, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.3426791277258567, "grad_norm": 0.5822417140007019, "kl": 0.04406227543950081, "learning_rate": 1.869375e-06, "loss": -0.0193, "num_tokens": 28025029.0, "reward": 1.5140615701675415, "reward_std": 0.10227732360363007, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5259661674499512, "rewards/correct_reward_func/std": 0.19070927798748016, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3088.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1581.6309814453125, "completions/mean_terminated_length": 1581.6309814453125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3442367601246106, "grad_norm": 0.6107174754142761, "kl": 0.04346361383795738, "learning_rate": 1.86875e-06, "loss": -0.0145, "num_tokens": 28163856.0, "reward": 1.5085567235946655, "reward_std": 0.08370744436979294, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5085568428039551, "rewards/correct_reward_func/std": 0.14021635055541992, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 1555.7261962890625, "completions/mean_terminated_length": 1555.7261962890625, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.34579439252336447, "grad_norm": 0.5877107977867126, "kl": 0.043224770575761795, "learning_rate": 1.868125e-06, "loss": -0.0078, "num_tokens": 28300597.0, "reward": 1.4819693565368652, "reward_std": 0.09113749116659164, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49387404322624207, "rewards/correct_reward_func/std": 0.19690191745758057, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1568.0714111328125, "completions/mean_terminated_length": 1568.0714111328125, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.34735202492211836, "grad_norm": 0.5657203197479248, "kl": 0.04514329880475998, "learning_rate": 1.8675e-06, "loss": -0.0005, "num_tokens": 28438135.0, "reward": 1.417351484298706, "reward_std": 0.08495763689279556, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4173515737056732, "rewards/correct_reward_func/std": 0.13209807872772217, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3149.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 1625.6785888671875, "completions/mean_terminated_length": 1625.6785888671875, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 0.34890965732087226, "grad_norm": 0.6052978038787842, "kl": 0.04441903904080391, "learning_rate": 1.866875e-06, "loss": -0.0046, "num_tokens": 28580818.0, "reward": 1.4290771484375, "reward_std": 0.10442011803388596, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44098177552223206, "rewards/correct_reward_func/std": 0.1553770899772644, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 1623.761962890625, "completions/mean_terminated_length": 1623.761962890625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.35046728971962615, "grad_norm": 0.5626906156539917, "kl": 0.04204758256673813, "learning_rate": 1.86625e-06, "loss": 0.0061, "num_tokens": 28723454.0, "reward": 1.496443510055542, "reward_std": 0.06441661715507507, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49644333124160767, "rewards/correct_reward_func/std": 0.1379326730966568, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1645.666748046875, "completions/mean_terminated_length": 1566.795166015625, "completions/min_length": 1062.0, "completions/min_terminated_length": 1062.0, "epoch": 0.35202492211838005, "grad_norm": 0.568545937538147, "kl": 0.041140057146549225, "learning_rate": 1.865625e-06, "loss": 0.0597, "num_tokens": 28867588.0, "reward": 1.450238585472107, "reward_std": 0.105202816426754, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4621433615684509, "rewards/correct_reward_func/std": 0.13623471558094025, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1559.2261962890625, "completions/mean_terminated_length": 1559.2261962890625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.35358255451713394, "grad_norm": 0.6394890546798706, "kl": 0.04375514201819897, "learning_rate": 1.865e-06, "loss": -0.0027, "num_tokens": 29004347.0, "reward": 1.4285489320755005, "reward_std": 0.04757591709494591, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42854899168014526, "rewards/correct_reward_func/std": 0.12418505549430847, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1529.46435546875, "completions/mean_terminated_length": 1529.46435546875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.35514018691588783, "grad_norm": 0.6031341552734375, "kl": 0.04356654919683933, "learning_rate": 1.8643749999999998e-06, "loss": -0.0296, "num_tokens": 29138618.0, "reward": 1.5230813026428223, "reward_std": 0.09695133566856384, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5230813026428223, "rewards/correct_reward_func/std": 0.19408383965492249, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1547.21435546875, "completions/mean_terminated_length": 1547.21435546875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.35669781931464173, "grad_norm": 0.5816037654876709, "kl": 0.04396030865609646, "learning_rate": 1.86375e-06, "loss": -0.0083, "num_tokens": 29274332.0, "reward": 1.4719088077545166, "reward_std": 0.07453076541423798, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4719087481498718, "rewards/correct_reward_func/std": 0.17895452678203583, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6889.0, "completions/max_terminated_length": 6889.0, "completions/mean_length": 1678.75, "completions/mean_terminated_length": 1678.75, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.3582554517133956, "grad_norm": 0.5625508427619934, "kl": 0.04273660108447075, "learning_rate": 1.8631249999999998e-06, "loss": -0.0144, "num_tokens": 29421227.0, "reward": 1.4578830003738403, "reward_std": 0.06554538756608963, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45788294076919556, "rewards/correct_reward_func/std": 0.19993598759174347, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 1663.2857666015625, "completions/mean_terminated_length": 1584.62646484375, "completions/min_length": 1098.0, "completions/min_terminated_length": 1098.0, "epoch": 0.3598130841121495, "grad_norm": 0.6602963209152222, "kl": 0.042186228558421135, "learning_rate": 1.8625e-06, "loss": 0.0662, "num_tokens": 29566793.0, "reward": 1.509660005569458, "reward_std": 0.08291852474212646, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5096598267555237, "rewards/correct_reward_func/std": 0.140364408493042, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 1553.5, "completions/mean_terminated_length": 1553.5, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.3613707165109034, "grad_norm": 0.5636409521102905, "kl": 0.0421723909676075, "learning_rate": 1.8618749999999999e-06, "loss": 0.0171, "num_tokens": 29703263.0, "reward": 1.4572166204452515, "reward_std": 0.08119600266218185, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4691213071346283, "rewards/correct_reward_func/std": 0.16932806372642517, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1605.0833740234375, "completions/mean_terminated_length": 1605.0833740234375, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.3629283489096573, "grad_norm": 0.5694032907485962, "kl": 0.042043108493089676, "learning_rate": 1.86125e-06, "loss": 0.0077, "num_tokens": 29844144.0, "reward": 1.4451359510421753, "reward_std": 0.05986570194363594, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44513583183288574, "rewards/correct_reward_func/std": 0.10634031891822815, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 1589.84521484375, "completions/mean_terminated_length": 1589.84521484375, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 0.3644859813084112, "grad_norm": 0.5871843695640564, "kl": 0.04350174590945244, "learning_rate": 1.8606249999999999e-06, "loss": 0.0386, "num_tokens": 29983685.0, "reward": 1.4710910320281982, "reward_std": 0.08404207974672318, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47109100222587585, "rewards/correct_reward_func/std": 0.17535626888275146, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2334.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 1567.0357666015625, "completions/mean_terminated_length": 1567.0357666015625, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.3660436137071651, "grad_norm": 0.5786914825439453, "kl": 0.03941281884908676, "learning_rate": 1.86e-06, "loss": 0.0094, "num_tokens": 30121118.0, "reward": 1.4925031661987305, "reward_std": 0.10374214500188828, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5044078826904297, "rewards/correct_reward_func/std": 0.15605290234088898, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1611.0238037109375, "completions/mean_terminated_length": 1611.0238037109375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.367601246105919, "grad_norm": 0.5391596555709839, "kl": 0.0409199483692646, "learning_rate": 1.8593749999999999e-06, "loss": 0.0334, "num_tokens": 30262456.0, "reward": 1.4530733823776245, "reward_std": 0.06101413816213608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4530732333660126, "rewards/correct_reward_func/std": 0.17970231175422668, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1591.416748046875, "completions/mean_terminated_length": 1591.416748046875, "completions/min_length": 1109.0, "completions/min_terminated_length": 1109.0, "epoch": 0.3691588785046729, "grad_norm": 0.595643937587738, "kl": 0.04340810887515545, "learning_rate": 1.8587499999999998e-06, "loss": 0.0072, "num_tokens": 30402333.0, "reward": 1.46660578250885, "reward_std": 0.06254373490810394, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46660569310188293, "rewards/correct_reward_func/std": 0.12236826121807098, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1531.9881591796875, "completions/mean_terminated_length": 1531.9881591796875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.3707165109034268, "grad_norm": 0.5581408143043518, "kl": 0.04236830584704876, "learning_rate": 1.8581249999999999e-06, "loss": 0.0098, "num_tokens": 30537140.0, "reward": 1.5432277917861938, "reward_std": 0.0626709833741188, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5432276129722595, "rewards/correct_reward_func/std": 0.14388269186019897, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1550.797607421875, "completions/mean_terminated_length": 1550.797607421875, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 0.37227414330218067, "grad_norm": 0.574525773525238, "kl": 0.04116277024149895, "learning_rate": 1.8574999999999998e-06, "loss": -0.0023, "num_tokens": 30673653.0, "reward": 1.4574679136276245, "reward_std": 0.06034516915678978, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4574679434299469, "rewards/correct_reward_func/std": 0.12509454786777496, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1503.8929443359375, "completions/mean_terminated_length": 1503.8929443359375, "completions/min_length": 1007.0, "completions/min_terminated_length": 1007.0, "epoch": 0.37383177570093457, "grad_norm": 0.5814647674560547, "kl": 0.04194348491728306, "learning_rate": 1.856875e-06, "loss": 0.0009, "num_tokens": 30806058.0, "reward": 1.4536018371582031, "reward_std": 0.0663735568523407, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45360174775123596, "rewards/correct_reward_func/std": 0.11585589498281479, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1468.6785888671875, "completions/mean_terminated_length": 1468.6785888671875, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.37538940809968846, "grad_norm": 0.5900517702102661, "kl": 0.041691072285175323, "learning_rate": 1.8562499999999998e-06, "loss": 0.0052, "num_tokens": 30935385.0, "reward": 1.4337111711502075, "reward_std": 0.0778563842177391, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4456157684326172, "rewards/correct_reward_func/std": 0.13722439110279083, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1410.9405517578125, "completions/mean_terminated_length": 1410.9405517578125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.37694704049844235, "grad_norm": 0.617452085018158, "kl": 0.04172271862626076, "learning_rate": 1.855625e-06, "loss": 0.0126, "num_tokens": 31059784.0, "reward": 1.4357478618621826, "reward_std": 0.08216311782598495, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44765254855155945, "rewards/correct_reward_func/std": 0.13791020214557648, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 1451.5833740234375, "completions/mean_terminated_length": 1451.5833740234375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.37850467289719625, "grad_norm": 0.5894716382026672, "kl": 0.040936123579740524, "learning_rate": 1.8549999999999998e-06, "loss": 0.0101, "num_tokens": 31187819.0, "reward": 1.533345341682434, "reward_std": 0.07711285352706909, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5333453416824341, "rewards/correct_reward_func/std": 0.12535390257835388, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 1480.15478515625, "completions/mean_terminated_length": 1480.15478515625, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 0.38006230529595014, "grad_norm": 0.613828182220459, "kl": 0.043313439935445786, "learning_rate": 1.854375e-06, "loss": -0.0173, "num_tokens": 31318278.0, "reward": 1.4279972314834595, "reward_std": 0.06592278927564621, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4279972314834595, "rewards/correct_reward_func/std": 0.1297278255224228, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 1363.1905517578125, "completions/mean_terminated_length": 1363.1905517578125, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.38161993769470404, "grad_norm": 0.6236963272094727, "kl": 0.04276050627231598, "learning_rate": 1.8537499999999998e-06, "loss": 0.0313, "num_tokens": 31438636.0, "reward": 1.4753836393356323, "reward_std": 0.09151271730661392, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48728832602500916, "rewards/correct_reward_func/std": 0.1840963512659073, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1526.7857666015625, "completions/mean_terminated_length": 1446.48193359375, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.38317757009345793, "grad_norm": 0.5810291171073914, "kl": 0.04004097357392311, "learning_rate": 1.8531249999999997e-06, "loss": 0.0546, "num_tokens": 31573048.0, "reward": 1.4642269611358643, "reward_std": 0.07142822444438934, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46422696113586426, "rewards/correct_reward_func/std": 0.13529928028583527, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1418.5595703125, "completions/mean_terminated_length": 1418.5595703125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.3847352024922118, "grad_norm": 0.594132125377655, "kl": 0.04153658635914326, "learning_rate": 1.8525e-06, "loss": 0.0037, "num_tokens": 31698363.0, "reward": 1.4876208305358887, "reward_std": 0.05413410812616348, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4876207709312439, "rewards/correct_reward_func/std": 0.134954035282135, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 1418.4285888671875, "completions/mean_terminated_length": 1418.4285888671875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.3862928348909657, "grad_norm": 0.6255624890327454, "kl": 0.04057171009480953, "learning_rate": 1.851875e-06, "loss": 0.0215, "num_tokens": 31823583.0, "reward": 1.5005978345870972, "reward_std": 0.07266637682914734, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5125024914741516, "rewards/correct_reward_func/std": 0.1100957989692688, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 1451.107177734375, "completions/mean_terminated_length": 1451.107177734375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.3878504672897196, "grad_norm": 0.5847201943397522, "kl": 0.04207037389278412, "learning_rate": 1.85125e-06, "loss": 0.0091, "num_tokens": 31951524.0, "reward": 1.4142539501190186, "reward_std": 0.05806390568614006, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41425377130508423, "rewards/correct_reward_func/std": 0.11380590498447418, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 1413.1905517578125, "completions/mean_terminated_length": 1413.1905517578125, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.3894080996884735, "grad_norm": 0.6250362396240234, "kl": 0.04115402512252331, "learning_rate": 1.850625e-06, "loss": -0.0033, "num_tokens": 32076256.0, "reward": 1.4924920797348022, "reward_std": 0.08884865790605545, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5043967962265015, "rewards/correct_reward_func/std": 0.13221827149391174, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1406.2261962890625, "completions/mean_terminated_length": 1406.2261962890625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.3909657320872274, "grad_norm": 0.6414059996604919, "kl": 0.0399419330060482, "learning_rate": 1.85e-06, "loss": 0.0096, "num_tokens": 32200295.0, "reward": 1.4972379207611084, "reward_std": 0.07513421773910522, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4972379803657532, "rewards/correct_reward_func/std": 0.14758117496967316, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3084.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 1468.4285888671875, "completions/mean_terminated_length": 1468.4285888671875, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 0.3925233644859813, "grad_norm": 0.5872684717178345, "kl": 0.04071245715022087, "learning_rate": 1.849375e-06, "loss": -0.0157, "num_tokens": 32329739.0, "reward": 1.4793757200241089, "reward_std": 0.07921571284532547, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4793757498264313, "rewards/correct_reward_func/std": 0.14062678813934326, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1309.952392578125, "completions/mean_terminated_length": 1309.952392578125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.3940809968847352, "grad_norm": 0.6398329138755798, "kl": 0.042467374354600906, "learning_rate": 1.8487499999999999e-06, "loss": -0.001, "num_tokens": 32445493.0, "reward": 1.4916237592697144, "reward_std": 0.10208263248205185, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5035284757614136, "rewards/correct_reward_func/std": 0.2051754891872406, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1518.261962890625, "completions/mean_terminated_length": 1437.8553466796875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.3956386292834891, "grad_norm": 0.6442155838012695, "kl": 0.04048139229416847, "learning_rate": 1.848125e-06, "loss": 0.0944, "num_tokens": 32579225.0, "reward": 1.4657591581344604, "reward_std": 0.10077626258134842, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4776638448238373, "rewards/correct_reward_func/std": 0.15037497878074646, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1396.46435546875, "completions/mean_terminated_length": 1396.46435546875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.397196261682243, "grad_norm": 0.5844140648841858, "kl": 0.03995479829609394, "learning_rate": 1.8474999999999999e-06, "loss": -0.0318, "num_tokens": 32702402.0, "reward": 1.4533027410507202, "reward_std": 0.11024706810712814, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4652075171470642, "rewards/correct_reward_func/std": 0.15434937179088593, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1392.0357666015625, "completions/mean_terminated_length": 1392.0357666015625, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 0.3987538940809969, "grad_norm": 0.6156937479972839, "kl": 0.04135890118777752, "learning_rate": 1.846875e-06, "loss": -0.0274, "num_tokens": 32825423.0, "reward": 1.4397926330566406, "reward_std": 0.13189160823822021, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.46360212564468384, "rewards/correct_reward_func/std": 0.11824122816324234, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1323.09521484375, "completions/mean_terminated_length": 1323.09521484375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.40031152647975077, "grad_norm": 0.6653797626495361, "kl": 0.041794365271925926, "learning_rate": 1.84625e-06, "loss": 0.0155, "num_tokens": 32942683.0, "reward": 1.4287505149841309, "reward_std": 0.10937704890966415, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.45256009697914124, "rewards/correct_reward_func/std": 0.1507876217365265, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1432.6905517578125, "completions/mean_terminated_length": 1432.6905517578125, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 0.40186915887850466, "grad_norm": 0.5770981311798096, "kl": 0.04339606128633022, "learning_rate": 1.845625e-06, "loss": -0.0066, "num_tokens": 33068987.0, "reward": 1.435407280921936, "reward_std": 0.10868566483259201, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.459216833114624, "rewards/correct_reward_func/std": 0.13378530740737915, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1342.0, "completions/mean_terminated_length": 1342.0, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 0.40342679127725856, "grad_norm": 0.5935827493667603, "kl": 0.0418770182877779, "learning_rate": 1.845e-06, "loss": -0.0092, "num_tokens": 33187595.0, "reward": 1.4948703050613403, "reward_std": 0.06175254285335541, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4948703348636627, "rewards/correct_reward_func/std": 0.14253027737140656, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1342.392822265625, "completions/mean_terminated_length": 1342.392822265625, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 0.40498442367601245, "grad_norm": 0.6045507192611694, "kl": 0.04008128307759762, "learning_rate": 1.844375e-06, "loss": 0.017, "num_tokens": 33306416.0, "reward": 1.4627269506454468, "reward_std": 0.11455470323562622, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.48653644323349, "rewards/correct_reward_func/std": 0.13522200286388397, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2129.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 1361.892822265625, "completions/mean_terminated_length": 1361.892822265625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.40654205607476634, "grad_norm": 0.6531806588172913, "kl": 0.043158069252967834, "learning_rate": 1.84375e-06, "loss": -0.0002, "num_tokens": 33426683.0, "reward": 1.4994860887527466, "reward_std": 0.056075319647789, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4994860291481018, "rewards/correct_reward_func/std": 0.13296645879745483, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 1350.40478515625, "completions/mean_terminated_length": 1350.40478515625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.40809968847352024, "grad_norm": 0.6397004723548889, "kl": 0.042096974328160286, "learning_rate": 1.8431249999999998e-06, "loss": 0.0194, "num_tokens": 33546147.0, "reward": 1.5182113647460938, "reward_std": 0.09466809034347534, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5420209169387817, "rewards/correct_reward_func/std": 0.1314767450094223, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 1378.107177734375, "completions/mean_terminated_length": 1378.107177734375, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 0.40965732087227413, "grad_norm": 0.6153465509414673, "kl": 0.04178653843700886, "learning_rate": 1.8425e-06, "loss": 0.0061, "num_tokens": 33667932.0, "reward": 1.5274070501327515, "reward_std": 0.06379646062850952, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5274069309234619, "rewards/correct_reward_func/std": 0.11842171102762222, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1360.357177734375, "completions/mean_terminated_length": 1360.357177734375, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 0.411214953271028, "grad_norm": 0.6154939532279968, "kl": 0.04235922172665596, "learning_rate": 1.8418749999999998e-06, "loss": 0.0078, "num_tokens": 33788220.0, "reward": 1.4856061935424805, "reward_std": 0.09970905631780624, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5094156861305237, "rewards/correct_reward_func/std": 0.11878927052021027, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 1453.40478515625, "completions/mean_terminated_length": 1453.40478515625, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.4127725856697819, "grad_norm": 0.5895340442657471, "kl": 0.041117291897535324, "learning_rate": 1.84125e-06, "loss": 0.0121, "num_tokens": 33916336.0, "reward": 1.5335173606872559, "reward_std": 0.06757655739784241, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5335173010826111, "rewards/correct_reward_func/std": 0.15610700845718384, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 1328.607177734375, "completions/mean_terminated_length": 1328.607177734375, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "epoch": 0.4143302180685358, "grad_norm": 0.608429491519928, "kl": 0.04289627820253372, "learning_rate": 1.8406249999999998e-06, "loss": 0.0158, "num_tokens": 34033747.0, "reward": 1.4430692195892334, "reward_std": 0.05843156576156616, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.443069189786911, "rewards/correct_reward_func/std": 0.1901571899652481, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 1355.8809814453125, "completions/mean_terminated_length": 1355.8809814453125, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.4158878504672897, "grad_norm": 0.6520563960075378, "kl": 0.04428970441222191, "learning_rate": 1.84e-06, "loss": 0.0026, "num_tokens": 34153503.0, "reward": 1.441379427909851, "reward_std": 0.11828587204217911, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4651888906955719, "rewards/correct_reward_func/std": 0.1112525463104248, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1398.202392578125, "completions/mean_terminated_length": 1398.202392578125, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.4174454828660436, "grad_norm": 0.6122962236404419, "kl": 0.043515296652913094, "learning_rate": 1.8393749999999999e-06, "loss": -0.0184, "num_tokens": 34277012.0, "reward": 1.4698716402053833, "reward_std": 0.10509771853685379, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4936811327934265, "rewards/correct_reward_func/std": 0.1476019024848938, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3903.0, "completions/max_terminated_length": 3903.0, "completions/mean_length": 1497.8929443359375, "completions/mean_terminated_length": 1497.8929443359375, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.4190031152647975, "grad_norm": 0.5924271941184998, "kl": 0.04205969721078873, "learning_rate": 1.83875e-06, "loss": 0.0098, "num_tokens": 34408937.0, "reward": 1.509368896484375, "reward_std": 0.09606263041496277, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5212737321853638, "rewards/correct_reward_func/std": 0.1800881028175354, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1369.9285888671875, "completions/mean_terminated_length": 1369.9285888671875, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.4205607476635514, "grad_norm": 0.6304810047149658, "kl": 0.04325720854103565, "learning_rate": 1.8381249999999999e-06, "loss": 0.0025, "num_tokens": 34530017.0, "reward": 1.5110054016113281, "reward_std": 0.0600060299038887, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5110054016113281, "rewards/correct_reward_func/std": 0.1884339600801468, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1406.3214111328125, "completions/mean_terminated_length": 1406.3214111328125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.4221183800623053, "grad_norm": 0.6104097366333008, "kl": 0.041636811569333076, "learning_rate": 1.8374999999999998e-06, "loss": -0.0099, "num_tokens": 34654028.0, "reward": 1.5112788677215576, "reward_std": 0.07226122170686722, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5112787485122681, "rewards/correct_reward_func/std": 0.16975651681423187, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 1444.3690185546875, "completions/mean_terminated_length": 1444.3690185546875, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.4236760124610592, "grad_norm": 0.6154975295066833, "kl": 0.045218756422400475, "learning_rate": 1.8368749999999999e-06, "loss": 0.012, "num_tokens": 34781289.0, "reward": 1.570892333984375, "reward_std": 0.06964144110679626, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.570892333984375, "rewards/correct_reward_func/std": 0.17510028183460236, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 1431.8333740234375, "completions/mean_terminated_length": 1431.8333740234375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.4252336448598131, "grad_norm": 0.6236024498939514, "kl": 0.04151295870542526, "learning_rate": 1.8362499999999998e-06, "loss": -0.022, "num_tokens": 34907617.0, "reward": 1.4521212577819824, "reward_std": 0.12464414536952972, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47593072056770325, "rewards/correct_reward_func/std": 0.20174725353717804, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2056.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 1358.96435546875, "completions/mean_terminated_length": 1358.96435546875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.42679127725856697, "grad_norm": 0.6278449892997742, "kl": 0.04468250274658203, "learning_rate": 1.835625e-06, "loss": -0.0027, "num_tokens": 35027644.0, "reward": 1.4580986499786377, "reward_std": 0.10002487152814865, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4700033664703369, "rewards/correct_reward_func/std": 0.13392986357212067, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 1500.6190185546875, "completions/mean_terminated_length": 1500.6190185546875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.42834890965732086, "grad_norm": 0.5943901538848877, "kl": 0.04206756874918938, "learning_rate": 1.8349999999999998e-06, "loss": 0.0157, "num_tokens": 35159816.0, "reward": 1.392418622970581, "reward_std": 0.08610358834266663, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4043233394622803, "rewards/correct_reward_func/std": 0.11100338399410248, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1480.7261962890625, "completions/mean_terminated_length": 1480.7261962890625, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.42990654205607476, "grad_norm": 0.5723066329956055, "kl": 0.04631072096526623, "learning_rate": 1.834375e-06, "loss": 0.0086, "num_tokens": 35289993.0, "reward": 1.4739741086959839, "reward_std": 0.09586656838655472, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4858788549900055, "rewards/correct_reward_func/std": 0.15505263209342957, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 8192.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 1646.666748046875, "completions/mean_terminated_length": 1404.2469482421875, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.43146417445482865, "grad_norm": 0.543669581413269, "kl": 0.03733106330037117, "learning_rate": 1.8337499999999998e-06, "loss": 0.182, "num_tokens": 35434073.0, "reward": 1.4567782878875732, "reward_std": 0.13338518142700195, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46868306398391724, "rewards/correct_reward_func/std": 0.16619150340557098, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1501.5595703125, "completions/mean_terminated_length": 1501.5595703125, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.43302180685358255, "grad_norm": 0.6221758723258972, "kl": 0.04269747622311115, "learning_rate": 1.8331249999999997e-06, "loss": 0.0145, "num_tokens": 35566138.0, "reward": 1.478410243988037, "reward_std": 0.0783148929476738, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49031493067741394, "rewards/correct_reward_func/std": 0.1501626968383789, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7459.0, "completions/max_terminated_length": 7459.0, "completions/mean_length": 1598.6785888671875, "completions/mean_terminated_length": 1598.6785888671875, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 0.43457943925233644, "grad_norm": 0.5859509706497192, "kl": 0.04193317890167236, "learning_rate": 1.8325e-06, "loss": 0.0074, "num_tokens": 35706385.0, "reward": 1.4401181936264038, "reward_std": 0.0550164058804512, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4401181638240814, "rewards/correct_reward_func/std": 0.1537192016839981, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1499.8095703125, "completions/mean_terminated_length": 1419.1806640625, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.43613707165109034, "grad_norm": 0.595498263835907, "kl": 0.04362577013671398, "learning_rate": 1.831875e-06, "loss": 0.0495, "num_tokens": 35838297.0, "reward": 1.4194005727767944, "reward_std": 0.08994495123624802, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4313051104545593, "rewards/correct_reward_func/std": 0.14579959213733673, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3234.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 1566.0357666015625, "completions/mean_terminated_length": 1566.0357666015625, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.43769470404984423, "grad_norm": 0.5680190920829773, "kl": 0.0422314815223217, "learning_rate": 1.83125e-06, "loss": 0.0167, "num_tokens": 35975880.0, "reward": 1.475471019744873, "reward_std": 0.11540813744068146, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.49928048253059387, "rewards/correct_reward_func/std": 0.1275622546672821, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1490.1309814453125, "completions/mean_terminated_length": 1490.1309814453125, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.4392523364485981, "grad_norm": 0.5545284152030945, "kl": 0.044208116829395294, "learning_rate": 1.830625e-06, "loss": -0.0119, "num_tokens": 36106913.0, "reward": 1.467574119567871, "reward_std": 0.09451182931661606, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4794788360595703, "rewards/correct_reward_func/std": 0.15235535800457, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 1569.4405517578125, "completions/mean_terminated_length": 1569.4405517578125, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.440809968847352, "grad_norm": 0.5956711173057556, "kl": 0.04277007095515728, "learning_rate": 1.83e-06, "loss": -0.0127, "num_tokens": 36244560.0, "reward": 1.465549111366272, "reward_std": 0.0775151252746582, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4774538576602936, "rewards/correct_reward_func/std": 0.18763205409049988, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1505.1905517578125, "completions/mean_terminated_length": 1505.1905517578125, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.4423676012461059, "grad_norm": 0.613337516784668, "kl": 0.04313294030725956, "learning_rate": 1.829375e-06, "loss": -0.0096, "num_tokens": 36376960.0, "reward": 1.51215660572052, "reward_std": 0.07286342978477478, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5121565461158752, "rewards/correct_reward_func/std": 0.17705413699150085, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 1624.3809814453125, "completions/mean_terminated_length": 1624.3809814453125, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.4439252336448598, "grad_norm": 0.5604744553565979, "kl": 0.04310506582260132, "learning_rate": 1.82875e-06, "loss": 0.008, "num_tokens": 36519414.0, "reward": 1.4455859661102295, "reward_std": 0.07997937500476837, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45749059319496155, "rewards/correct_reward_func/std": 0.15531718730926514, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1497.3809814453125, "completions/mean_terminated_length": 1497.3809814453125, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.4454828660436137, "grad_norm": 0.6142125129699707, "kl": 0.04405369609594345, "learning_rate": 1.828125e-06, "loss": 0.0213, "num_tokens": 36651170.0, "reward": 1.4967048168182373, "reward_std": 0.05460391938686371, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49670469760894775, "rewards/correct_reward_func/std": 0.11759886145591736, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 1550.3809814453125, "completions/mean_terminated_length": 1550.3809814453125, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.4470404984423676, "grad_norm": 0.5982187390327454, "kl": 0.043098822236061096, "learning_rate": 1.8274999999999999e-06, "loss": 0.0141, "num_tokens": 36787384.0, "reward": 1.5390734672546387, "reward_std": 0.07396355271339417, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5390734672546387, "rewards/correct_reward_func/std": 0.15539847314357758, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 1590.9285888671875, "completions/mean_terminated_length": 1511.3975830078125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.4485981308411215, "grad_norm": 0.5779162645339966, "kl": 0.04267328046262264, "learning_rate": 1.826875e-06, "loss": 0.0529, "num_tokens": 36926968.0, "reward": 1.4345697164535522, "reward_std": 0.06787623465061188, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4345696270465851, "rewards/correct_reward_func/std": 0.1536804884672165, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 1572.46435546875, "completions/mean_terminated_length": 1572.46435546875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.4501557632398754, "grad_norm": 0.5400437116622925, "kl": 0.0447152704000473, "learning_rate": 1.8262499999999999e-06, "loss": -0.0149, "num_tokens": 37065001.0, "reward": 1.4242392778396606, "reward_std": 0.0718853771686554, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42423921823501587, "rewards/correct_reward_func/std": 0.12876558303833008, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 1544.71435546875, "completions/mean_terminated_length": 1544.71435546875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.4517133956386293, "grad_norm": 0.5619280338287354, "kl": 0.044344568625092506, "learning_rate": 1.825625e-06, "loss": 0.0099, "num_tokens": 37200853.0, "reward": 1.4639300107955933, "reward_std": 0.1035037636756897, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47583478689193726, "rewards/correct_reward_func/std": 0.17393389344215393, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1503.4761962890625, "completions/mean_terminated_length": 1503.4761962890625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.4532710280373832, "grad_norm": 0.5982638001441956, "kl": 0.04427545331418514, "learning_rate": 1.8249999999999999e-06, "loss": -0.0342, "num_tokens": 37332971.0, "reward": 1.4380909204483032, "reward_std": 0.06856860220432281, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43809083104133606, "rewards/correct_reward_func/std": 0.13657042384147644, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1529.8214111328125, "completions/mean_terminated_length": 1529.8214111328125, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.45482866043613707, "grad_norm": 0.6071525812149048, "kl": 0.04384468495845795, "learning_rate": 1.824375e-06, "loss": 0.0026, "num_tokens": 37467494.0, "reward": 1.4335728883743286, "reward_std": 0.04877452179789543, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43357276916503906, "rewards/correct_reward_func/std": 0.14995594322681427, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1446.8690185546875, "completions/mean_terminated_length": 1446.8690185546875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 0.45638629283489096, "grad_norm": 0.6094366312026978, "kl": 0.04509362578392029, "learning_rate": 1.82375e-06, "loss": 0.0195, "num_tokens": 37594917.0, "reward": 1.5306507349014282, "reward_std": 0.08785208314657211, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5306507349014282, "rewards/correct_reward_func/std": 0.14832548797130585, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 1544.011962890625, "completions/mean_terminated_length": 1544.011962890625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.45794392523364486, "grad_norm": 0.6053863167762756, "kl": 0.04364632070064545, "learning_rate": 1.823125e-06, "loss": -0.0119, "num_tokens": 37730644.0, "reward": 1.4835604429244995, "reward_std": 0.057300373911857605, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4835604131221771, "rewards/correct_reward_func/std": 0.17512056231498718, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3292.0, "completions/max_terminated_length": 3292.0, "completions/mean_length": 1579.09521484375, "completions/mean_terminated_length": 1579.09521484375, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.45950155763239875, "grad_norm": 0.5880759954452515, "kl": 0.0431599710136652, "learning_rate": 1.8225e-06, "loss": -0.0029, "num_tokens": 37869492.0, "reward": 1.462117314338684, "reward_std": 0.05300503969192505, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46211737394332886, "rewards/correct_reward_func/std": 0.11961612105369568, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1529.5, "completions/mean_terminated_length": 1529.5, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.46105919003115264, "grad_norm": 0.5763446092605591, "kl": 0.04444514401257038, "learning_rate": 1.8218749999999998e-06, "loss": 0.0078, "num_tokens": 38003970.0, "reward": 1.4610320329666138, "reward_std": 0.08918090909719467, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46103209257125854, "rewards/correct_reward_func/std": 0.13395950198173523, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 1656.46435546875, "completions/mean_terminated_length": 1577.7227783203125, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.46261682242990654, "grad_norm": 0.5834442973136902, "kl": 0.041973644867539406, "learning_rate": 1.82125e-06, "loss": 0.0617, "num_tokens": 38149245.0, "reward": 1.4773956537246704, "reward_std": 0.09391757100820541, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48930031061172485, "rewards/correct_reward_func/std": 0.1633952558040619, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1549.3929443359375, "completions/mean_terminated_length": 1549.3929443359375, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 0.46417445482866043, "grad_norm": 0.6317084431648254, "kl": 0.046587640419602394, "learning_rate": 1.8206249999999998e-06, "loss": 0.0175, "num_tokens": 38285280.0, "reward": 1.449130892753601, "reward_std": 0.14152653515338898, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47294026613235474, "rewards/correct_reward_func/std": 0.14357496798038483, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 1490.2738037109375, "completions/mean_terminated_length": 1490.2738037109375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4657320872274143, "grad_norm": 0.5916226506233215, "kl": 0.04444451816380024, "learning_rate": 1.82e-06, "loss": -0.0187, "num_tokens": 38416565.0, "reward": 1.4541789293289185, "reward_std": 0.04750651866197586, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45417898893356323, "rewards/correct_reward_func/std": 0.1580149382352829, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1604.3333740234375, "completions/mean_terminated_length": 1524.9637451171875, "completions/min_length": 1040.0, "completions/min_terminated_length": 1040.0, "epoch": 0.4672897196261682, "grad_norm": 0.5734702944755554, "kl": 0.0429048128426075, "learning_rate": 1.8193749999999998e-06, "loss": 0.0538, "num_tokens": 38557047.0, "reward": 1.4508525133132935, "reward_std": 0.12774604558944702, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47466206550598145, "rewards/correct_reward_func/std": 0.14542065560817719, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2414.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 1552.4761962890625, "completions/mean_terminated_length": 1552.4761962890625, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.4688473520249221, "grad_norm": 0.596815288066864, "kl": 0.043997010216116905, "learning_rate": 1.81875e-06, "loss": 0.0025, "num_tokens": 38693335.0, "reward": 1.4665279388427734, "reward_std": 0.0923430472612381, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4784325957298279, "rewards/correct_reward_func/std": 0.1467050313949585, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5495.0, "completions/max_terminated_length": 5495.0, "completions/mean_length": 1538.1785888671875, "completions/mean_terminated_length": 1538.1785888671875, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 0.470404984423676, "grad_norm": 0.5941459536552429, "kl": 0.043290507048368454, "learning_rate": 1.8181249999999999e-06, "loss": 0.0057, "num_tokens": 38828434.0, "reward": 1.5739519596099854, "reward_std": 0.07381974905729294, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5739518404006958, "rewards/correct_reward_func/std": 0.1708972305059433, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 1535.1190185546875, "completions/mean_terminated_length": 1535.1190185546875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.4719626168224299, "grad_norm": 0.6311259865760803, "kl": 0.044257769361138344, "learning_rate": 1.8174999999999998e-06, "loss": -0.0331, "num_tokens": 38963522.0, "reward": 1.4050052165985107, "reward_std": 0.12261962890625, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.42881447076797485, "rewards/correct_reward_func/std": 0.15219928324222565, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 1518.4405517578125, "completions/mean_terminated_length": 1518.4405517578125, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.4735202492211838, "grad_norm": 0.5748085379600525, "kl": 0.0418586116284132, "learning_rate": 1.8168749999999999e-06, "loss": -0.0019, "num_tokens": 39097101.0, "reward": 1.4809808731079102, "reward_std": 0.09693938493728638, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4928855299949646, "rewards/correct_reward_func/std": 0.14456793665885925, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2232.0, "completions/max_terminated_length": 2232.0, "completions/mean_length": 1455.0595703125, "completions/mean_terminated_length": 1455.0595703125, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.4750778816199377, "grad_norm": 0.6262243986129761, "kl": 0.04568205960094929, "learning_rate": 1.8162499999999998e-06, "loss": 0.005, "num_tokens": 39225266.0, "reward": 1.5073304176330566, "reward_std": 0.0670827329158783, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5073302984237671, "rewards/correct_reward_func/std": 0.12795594334602356, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 1483.90478515625, "completions/mean_terminated_length": 1483.90478515625, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 0.4766355140186916, "grad_norm": 0.6198824644088745, "kl": 0.04370650835335255, "learning_rate": 1.8156249999999999e-06, "loss": 0.0063, "num_tokens": 39356004.0, "reward": 1.5359094142913818, "reward_std": 0.04993622750043869, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5359094142913818, "rewards/correct_reward_func/std": 0.13109326362609863, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 1559.5, "completions/mean_terminated_length": 1559.5, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.4781931464174455, "grad_norm": 0.5853797197341919, "kl": 0.04342350363731384, "learning_rate": 1.8149999999999998e-06, "loss": 0.0145, "num_tokens": 39493110.0, "reward": 1.4782713651657104, "reward_std": 0.08605591952800751, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49017614126205444, "rewards/correct_reward_func/std": 0.1508086770772934, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1511.09521484375, "completions/mean_terminated_length": 1511.09521484375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4797507788161994, "grad_norm": 0.5664383769035339, "kl": 0.04395752586424351, "learning_rate": 1.8143749999999999e-06, "loss": 0.0426, "num_tokens": 39625916.0, "reward": 1.4828287363052368, "reward_std": 0.06932734698057175, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48282867670059204, "rewards/correct_reward_func/std": 0.17860376834869385, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1484.916748046875, "completions/mean_terminated_length": 1484.916748046875, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.48130841121495327, "grad_norm": 0.617337167263031, "kl": 0.043506965041160583, "learning_rate": 1.8137499999999998e-06, "loss": -0.0018, "num_tokens": 39756451.0, "reward": 1.4927843809127808, "reward_std": 0.10661379992961884, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.50468909740448, "rewards/correct_reward_func/std": 0.1741182804107666, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 1531.8214111328125, "completions/mean_terminated_length": 1531.8214111328125, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.48286604361370716, "grad_norm": 0.635550320148468, "kl": 0.045443542301654816, "learning_rate": 1.8131250000000001e-06, "loss": -0.0145, "num_tokens": 39891016.0, "reward": 1.4746626615524292, "reward_std": 0.062213968485593796, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4746626019477844, "rewards/correct_reward_func/std": 0.1761779487133026, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 1529.7381591796875, "completions/mean_terminated_length": 1529.7381591796875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.48442367601246106, "grad_norm": 0.573308527469635, "kl": 0.04426569677889347, "learning_rate": 1.8125e-06, "loss": -0.0208, "num_tokens": 40025544.0, "reward": 1.506926417350769, "reward_std": 0.07785354554653168, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.506926417350769, "rewards/correct_reward_func/std": 0.15344847738742828, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1534.297607421875, "completions/mean_terminated_length": 1534.297607421875, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.48598130841121495, "grad_norm": 0.5884522199630737, "kl": 0.04365627467632294, "learning_rate": 1.811875e-06, "loss": -0.0051, "num_tokens": 40160329.0, "reward": 1.5241272449493408, "reward_std": 0.08640160411596298, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5241272449493408, "rewards/correct_reward_func/std": 0.1817137748003006, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 1530.40478515625, "completions/mean_terminated_length": 1530.40478515625, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.48753894080996885, "grad_norm": 0.6008781790733337, "kl": 0.04319826699793339, "learning_rate": 1.81125e-06, "loss": 0.0087, "num_tokens": 40294919.0, "reward": 1.5073949098587036, "reward_std": 0.06965342164039612, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5073949098587036, "rewards/correct_reward_func/std": 0.17690497636795044, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 1450.0, "completions/mean_terminated_length": 1450.0, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.48909657320872274, "grad_norm": 0.728448748588562, "kl": 0.044476715847849846, "learning_rate": 1.810625e-06, "loss": 0.0265, "num_tokens": 40422653.0, "reward": 1.4225661754608154, "reward_std": 0.1585291177034378, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.458280473947525, "rewards/correct_reward_func/std": 0.17805902659893036, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1425.8929443359375, "completions/mean_terminated_length": 1425.8929443359375, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.49065420560747663, "grad_norm": 0.6005982160568237, "kl": 0.043727852404117584, "learning_rate": 1.81e-06, "loss": 0.0128, "num_tokens": 40548212.0, "reward": 1.447396993637085, "reward_std": 0.0689636841416359, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4473969638347626, "rewards/correct_reward_func/std": 0.10822274535894394, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1671.4761962890625, "completions/mean_terminated_length": 1592.9156494140625, "completions/min_length": 1111.0, "completions/min_terminated_length": 1111.0, "epoch": 0.49221183800623053, "grad_norm": 0.5339775085449219, "kl": 0.04142884351313114, "learning_rate": 1.809375e-06, "loss": 0.0495, "num_tokens": 40694814.0, "reward": 1.5396287441253662, "reward_std": 0.06713546812534332, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5396286845207214, "rewards/correct_reward_func/std": 0.16389040648937225, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1511.6190185546875, "completions/mean_terminated_length": 1511.6190185546875, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "epoch": 0.4937694704049844, "grad_norm": 0.5666081309318542, "kl": 0.04457671754062176, "learning_rate": 1.80875e-06, "loss": -0.0035, "num_tokens": 40827688.0, "reward": 1.4729593992233276, "reward_std": 0.06596960127353668, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47295936942100525, "rewards/correct_reward_func/std": 0.18029561638832092, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 1499.1309814453125, "completions/mean_terminated_length": 1499.1309814453125, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.4953271028037383, "grad_norm": 0.5493736863136292, "kl": 0.04210697114467621, "learning_rate": 1.808125e-06, "loss": 0.0096, "num_tokens": 40959555.0, "reward": 1.4608927965164185, "reward_std": 0.05956989526748657, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46089282631874084, "rewards/correct_reward_func/std": 0.13776575028896332, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1471.8214111328125, "completions/mean_terminated_length": 1471.8214111328125, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.4968847352024922, "grad_norm": 0.5784661173820496, "kl": 0.04475216940045357, "learning_rate": 1.8075e-06, "loss": 0.003, "num_tokens": 41089194.0, "reward": 1.4597842693328857, "reward_std": 0.06170998513698578, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4597841799259186, "rewards/correct_reward_func/std": 0.132298082113266, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 1545.2381591796875, "completions/mean_terminated_length": 1545.2381591796875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.4984423676012461, "grad_norm": 0.5935900211334229, "kl": 0.04243394732475281, "learning_rate": 1.806875e-06, "loss": 0.0376, "num_tokens": 41224964.0, "reward": 1.4219588041305542, "reward_std": 0.07682619988918304, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43386340141296387, "rewards/correct_reward_func/std": 0.117339126765728, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 1519.857177734375, "completions/mean_terminated_length": 1519.857177734375, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.5, "grad_norm": 0.5818554162979126, "kl": 0.042232925072312355, "learning_rate": 1.8062499999999999e-06, "loss": 0.0097, "num_tokens": 41358608.0, "reward": 1.4776239395141602, "reward_std": 0.04936147853732109, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4776238799095154, "rewards/correct_reward_func/std": 0.10135854780673981, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 1553.5357666015625, "completions/mean_terminated_length": 1553.5357666015625, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.5015576323987538, "grad_norm": 0.5444162487983704, "kl": 0.04500117152929306, "learning_rate": 1.805625e-06, "loss": 0.0176, "num_tokens": 41494955.0, "reward": 1.4176223278045654, "reward_std": 0.08014075458049774, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4295269250869751, "rewards/correct_reward_func/std": 0.11872898042201996, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1534.8333740234375, "completions/mean_terminated_length": 1534.8333740234375, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "epoch": 0.5031152647975078, "grad_norm": 0.6494289040565491, "kl": 0.04654599726200104, "learning_rate": 1.8049999999999999e-06, "loss": -0.0155, "num_tokens": 41630079.0, "reward": 1.550278663635254, "reward_std": 0.0712866261601448, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5502786040306091, "rewards/correct_reward_func/std": 0.11994405835866928, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 1490.15478515625, "completions/mean_terminated_length": 1490.15478515625, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.5046728971962616, "grad_norm": 0.5980774760246277, "kl": 0.044281333684921265, "learning_rate": 1.804375e-06, "loss": -0.0166, "num_tokens": 41761258.0, "reward": 1.4597599506378174, "reward_std": 0.06575565785169601, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4597598910331726, "rewards/correct_reward_func/std": 0.13967834413051605, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 1490.0833740234375, "completions/mean_terminated_length": 1490.0833740234375, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.5062305295950156, "grad_norm": 0.636524498462677, "kl": 0.04570058174431324, "learning_rate": 1.8037499999999999e-06, "loss": 0.0381, "num_tokens": 41892449.0, "reward": 1.473886251449585, "reward_std": 0.08418666571378708, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4738861322402954, "rewards/correct_reward_func/std": 0.11435925960540771, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 1402.40478515625, "completions/mean_terminated_length": 1402.40478515625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.5077881619937694, "grad_norm": 0.6131139397621155, "kl": 0.04529164917767048, "learning_rate": 1.803125e-06, "loss": 0.0006, "num_tokens": 42016257.0, "reward": 1.4442152976989746, "reward_std": 0.06880811601877213, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4442150890827179, "rewards/correct_reward_func/std": 0.15375681221485138, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1520.6785888671875, "completions/mean_terminated_length": 1440.3011474609375, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 0.5093457943925234, "grad_norm": 0.5682876706123352, "kl": 0.043439922854304314, "learning_rate": 1.8025e-06, "loss": 0.0475, "num_tokens": 42150042.0, "reward": 1.4249800443649292, "reward_std": 0.09845460206270218, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43688473105430603, "rewards/correct_reward_func/std": 0.1302318572998047, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1349.4285888671875, "completions/mean_terminated_length": 1349.4285888671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5109034267912772, "grad_norm": 0.569858729839325, "kl": 0.043210411444306374, "learning_rate": 1.8018749999999998e-06, "loss": -0.0357, "num_tokens": 42269154.0, "reward": 1.5177992582321167, "reward_std": 0.08048205822706223, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5177991986274719, "rewards/correct_reward_func/std": 0.15029731392860413, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 1542.7261962890625, "completions/mean_terminated_length": 1462.6143798828125, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 0.5124610591900312, "grad_norm": 0.5456616878509521, "kl": 0.07378330640494823, "learning_rate": 1.80125e-06, "loss": 0.0514, "num_tokens": 42404617.0, "reward": 1.4895997047424316, "reward_std": 0.07681519538164139, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4895995855331421, "rewards/correct_reward_func/std": 0.2211972177028656, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1356.261962890625, "completions/mean_terminated_length": 1356.261962890625, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.514018691588785, "grad_norm": 0.6183797121047974, "kl": 0.04728836566209793, "learning_rate": 1.8006249999999998e-06, "loss": -0.0147, "num_tokens": 42524399.0, "reward": 1.4941986799240112, "reward_std": 0.06612447649240494, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49419865012168884, "rewards/correct_reward_func/std": 0.12598052620887756, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1400.3333740234375, "completions/mean_terminated_length": 1400.3333740234375, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.5155763239875389, "grad_norm": 0.564521849155426, "kl": 0.04574920795857906, "learning_rate": 1.8e-06, "loss": 0.0249, "num_tokens": 42647847.0, "reward": 1.5129057168960571, "reward_std": 0.055123478174209595, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5129056572914124, "rewards/correct_reward_func/std": 0.14364565908908844, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1540.7857666015625, "completions/mean_terminated_length": 1460.6505126953125, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 0.5171339563862928, "grad_norm": 0.6232556700706482, "kl": 0.04752310924232006, "learning_rate": 1.7993749999999998e-06, "loss": 0.0535, "num_tokens": 42783291.0, "reward": 1.4896059036254883, "reward_std": 0.07293432950973511, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4896059036254883, "rewards/correct_reward_func/std": 0.17983748018741608, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1505.9881591796875, "completions/mean_terminated_length": 1425.4337158203125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.5186915887850467, "grad_norm": 0.5811641216278076, "kl": 0.043217698112130165, "learning_rate": 1.79875e-06, "loss": 0.022, "num_tokens": 42915812.0, "reward": 1.4960260391235352, "reward_std": 0.0619584396481514, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4960259795188904, "rewards/correct_reward_func/std": 0.13359470665454865, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1459.607177734375, "completions/mean_terminated_length": 1459.607177734375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.5202492211838006, "grad_norm": 0.5979028344154358, "kl": 0.04545888490974903, "learning_rate": 1.7981249999999998e-06, "loss": 0.0155, "num_tokens": 43044449.0, "reward": 1.4552279710769653, "reward_std": 0.06798920035362244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45522791147232056, "rewards/correct_reward_func/std": 0.12315916270017624, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1457.8690185546875, "completions/mean_terminated_length": 1457.8690185546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5218068535825545, "grad_norm": 0.5863680839538574, "kl": 0.045693760737776756, "learning_rate": 1.7975e-06, "loss": -0.0156, "num_tokens": 43173012.0, "reward": 1.4975252151489258, "reward_std": 0.0669432058930397, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4975251257419586, "rewards/correct_reward_func/std": 0.13803456723690033, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1608.5357666015625, "completions/mean_terminated_length": 1529.216796875, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "epoch": 0.5233644859813084, "grad_norm": 0.556442379951477, "kl": 0.04240516573190689, "learning_rate": 1.7968749999999998e-06, "loss": 0.0712, "num_tokens": 43314111.0, "reward": 1.51832115650177, "reward_std": 0.1056382805109024, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5302258729934692, "rewards/correct_reward_func/std": 0.1863190233707428, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 1453.6905517578125, "completions/mean_terminated_length": 1453.6905517578125, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.5249221183800623, "grad_norm": 0.6169154644012451, "kl": 0.04544537514448166, "learning_rate": 1.7962499999999997e-06, "loss": 0.0458, "num_tokens": 43442275.0, "reward": 1.4681013822555542, "reward_std": 0.06039302796125412, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4681013226509094, "rewards/correct_reward_func/std": 0.1258929818868637, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1420.6190185546875, "completions/mean_terminated_length": 1420.6190185546875, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 0.5264797507788161, "grad_norm": 0.5530162453651428, "kl": 0.04460956156253815, "learning_rate": 1.7956249999999999e-06, "loss": -0.0063, "num_tokens": 43567595.0, "reward": 1.4919437170028687, "reward_std": 0.05034913867712021, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49194350838661194, "rewards/correct_reward_func/std": 0.1505471169948578, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1373.5238037109375, "completions/mean_terminated_length": 1373.5238037109375, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.5280373831775701, "grad_norm": 0.6410908102989197, "kl": 0.04922908917069435, "learning_rate": 1.7949999999999998e-06, "loss": 0.0041, "num_tokens": 43688893.0, "reward": 1.4642657041549683, "reward_std": 0.047354813665151596, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4642656445503235, "rewards/correct_reward_func/std": 0.1483275443315506, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 1476.666748046875, "completions/mean_terminated_length": 1476.666748046875, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.5295950155763239, "grad_norm": 0.5679633021354675, "kl": 0.04452272690832615, "learning_rate": 1.7943749999999999e-06, "loss": 0.0016, "num_tokens": 43819023.0, "reward": 1.4857591390609741, "reward_std": 0.06386592239141464, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48575901985168457, "rewards/correct_reward_func/std": 0.1048179492354393, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 1551.3929443359375, "completions/mean_terminated_length": 1471.385498046875, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.5311526479750779, "grad_norm": 0.5662598013877869, "kl": 0.04444094002246857, "learning_rate": 1.79375e-06, "loss": 0.07, "num_tokens": 43955154.0, "reward": 1.4689278602600098, "reward_std": 0.06177349016070366, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4689278304576874, "rewards/correct_reward_func/std": 0.1346137970685959, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1437.9285888671875, "completions/mean_terminated_length": 1437.9285888671875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.5327102803738317, "grad_norm": 0.6565880179405212, "kl": 0.04598667845129967, "learning_rate": 1.793125e-06, "loss": -0.0001, "num_tokens": 44081778.0, "reward": 1.5010194778442383, "reward_std": 0.06478109210729599, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5010193586349487, "rewards/correct_reward_func/std": 0.18090546131134033, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 1543.09521484375, "completions/mean_terminated_length": 1462.9879150390625, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.5342679127725857, "grad_norm": 0.6114045977592468, "kl": 0.043492890894412994, "learning_rate": 1.7925e-06, "loss": 0.0833, "num_tokens": 44217458.0, "reward": 1.45277738571167, "reward_std": 0.11382251977920532, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47658684849739075, "rewards/correct_reward_func/std": 0.16254591941833496, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 1516.1429443359375, "completions/mean_terminated_length": 1435.7108154296875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 0.5358255451713395, "grad_norm": 0.5839765667915344, "kl": 0.04525020532310009, "learning_rate": 1.791875e-06, "loss": 0.0436, "num_tokens": 44350790.0, "reward": 1.4360476732254028, "reward_std": 0.06001214683055878, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43604764342308044, "rewards/correct_reward_func/std": 0.1315266191959381, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1429.1785888671875, "completions/mean_terminated_length": 1429.1785888671875, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.5373831775700935, "grad_norm": 0.567746639251709, "kl": 0.04449603334069252, "learning_rate": 1.79125e-06, "loss": 0.0274, "num_tokens": 44476895.0, "reward": 1.4312893152236938, "reward_std": 0.0564139224588871, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43128931522369385, "rewards/correct_reward_func/std": 0.13854992389678955, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1476.25, "completions/mean_terminated_length": 1395.3372802734375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.5389408099688473, "grad_norm": 0.5948997735977173, "kl": 0.04478558525443077, "learning_rate": 1.790625e-06, "loss": 0.048, "num_tokens": 44606786.0, "reward": 1.4746853113174438, "reward_std": 0.08000284433364868, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4746852219104767, "rewards/correct_reward_func/std": 0.1414322406053543, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1520.8929443359375, "completions/mean_terminated_length": 1440.51806640625, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.5404984423676013, "grad_norm": 0.5873611569404602, "kl": 0.04373046010732651, "learning_rate": 1.79e-06, "loss": 0.0608, "num_tokens": 44740553.0, "reward": 1.3899791240692139, "reward_std": 0.1021641418337822, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4018838405609131, "rewards/correct_reward_func/std": 0.10966146737337112, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 1603.3809814453125, "completions/mean_terminated_length": 1524.0, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.5420560747663551, "grad_norm": 0.6007145643234253, "kl": 0.042141517624258995, "learning_rate": 1.789375e-06, "loss": 0.0734, "num_tokens": 44881519.0, "reward": 1.516649842262268, "reward_std": 0.06937997788190842, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5166497230529785, "rewards/correct_reward_func/std": 0.13787348568439484, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1627.0357666015625, "completions/mean_terminated_length": 1466.91455078125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.543613707165109, "grad_norm": 0.5334845185279846, "kl": 0.04171426221728325, "learning_rate": 1.78875e-06, "loss": 0.142, "num_tokens": 45024094.0, "reward": 1.4035788774490356, "reward_std": 0.07490548491477966, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40357890725135803, "rewards/correct_reward_func/std": 0.11934227496385574, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1388.5238037109375, "completions/mean_terminated_length": 1388.5238037109375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.5451713395638629, "grad_norm": 0.5753508806228638, "kl": 0.04579620808362961, "learning_rate": 1.788125e-06, "loss": -0.003, "num_tokens": 45146580.0, "reward": 1.4059325456619263, "reward_std": 0.05845046043395996, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4059324264526367, "rewards/correct_reward_func/std": 0.14846326410770416, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1354.6309814453125, "completions/mean_terminated_length": 1354.6309814453125, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.5467289719626168, "grad_norm": 0.6357160210609436, "kl": 0.045284371823072433, "learning_rate": 1.7875e-06, "loss": -0.0187, "num_tokens": 45266273.0, "reward": 1.4549281597137451, "reward_std": 0.07358434051275253, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45492807030677795, "rewards/correct_reward_func/std": 0.12501084804534912, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1370.2261962890625, "completions/mean_terminated_length": 1370.2261962890625, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.5482866043613707, "grad_norm": 0.6501032114028931, "kl": 0.04532886669039726, "learning_rate": 1.786875e-06, "loss": 0.0287, "num_tokens": 45387228.0, "reward": 1.509010910987854, "reward_std": 0.09563028067350388, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5328204035758972, "rewards/correct_reward_func/std": 0.1301979273557663, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1504.21435546875, "completions/mean_terminated_length": 1423.6385498046875, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.5498442367601246, "grad_norm": 0.614535391330719, "kl": 0.04563060216605663, "learning_rate": 1.7862499999999998e-06, "loss": 0.0562, "num_tokens": 45519630.0, "reward": 1.5245028734207153, "reward_std": 0.09191560745239258, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5245028138160706, "rewards/correct_reward_func/std": 0.19058886170387268, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1371.3690185546875, "completions/mean_terminated_length": 1289.1927490234375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.5514018691588785, "grad_norm": 0.5684062242507935, "kl": 0.04687408730387688, "learning_rate": 1.785625e-06, "loss": 0.1009, "num_tokens": 45640585.0, "reward": 1.423880696296692, "reward_std": 0.08402802050113678, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43578535318374634, "rewards/correct_reward_func/std": 0.14543381333351135, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1358.0238037109375, "completions/mean_terminated_length": 1358.0238037109375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.5529595015576324, "grad_norm": 0.6448848843574524, "kl": 0.048309145495295525, "learning_rate": 1.7849999999999999e-06, "loss": -0.0182, "num_tokens": 45760533.0, "reward": 1.4921796321868896, "reward_std": 0.07159780710935593, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4921795129776001, "rewards/correct_reward_func/std": 0.15320508182048798, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 1354.71435546875, "completions/mean_terminated_length": 1354.71435546875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.5545171339563862, "grad_norm": 0.6176822185516357, "kl": 0.04802674613893032, "learning_rate": 1.784375e-06, "loss": -0.0059, "num_tokens": 45880269.0, "reward": 1.4876474142074585, "reward_std": 0.09487791359424591, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4995521306991577, "rewards/correct_reward_func/std": 0.13108977675437927, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 1344.9285888671875, "completions/mean_terminated_length": 1344.9285888671875, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.5560747663551402, "grad_norm": 0.6490213871002197, "kl": 0.0472539346665144, "learning_rate": 1.7837499999999999e-06, "loss": 0.0013, "num_tokens": 45999249.0, "reward": 1.44069242477417, "reward_std": 0.11679985374212265, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.46450188755989075, "rewards/correct_reward_func/std": 0.13278451561927795, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 1377.7857666015625, "completions/mean_terminated_length": 1377.7857666015625, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.557632398753894, "grad_norm": 0.6063095331192017, "kl": 0.045867305248975754, "learning_rate": 1.783125e-06, "loss": -0.0135, "num_tokens": 46121055.0, "reward": 1.4912810325622559, "reward_std": 0.07250796258449554, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49128106236457825, "rewards/correct_reward_func/std": 0.12492024898529053, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2068.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1362.761962890625, "completions/mean_terminated_length": 1362.761962890625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.559190031152648, "grad_norm": 0.6036370992660522, "kl": 0.047231562435626984, "learning_rate": 1.7824999999999999e-06, "loss": -0.0009, "num_tokens": 46241479.0, "reward": 1.4975894689559937, "reward_std": 0.05849050357937813, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4975893795490265, "rewards/correct_reward_func/std": 0.18169310688972473, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1369.9881591796875, "completions/mean_terminated_length": 1369.9881591796875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.5607476635514018, "grad_norm": 0.582613468170166, "kl": 0.04558840952813625, "learning_rate": 1.781875e-06, "loss": 0.0201, "num_tokens": 46362546.0, "reward": 1.4481348991394043, "reward_std": 0.09183409065008163, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46003949642181396, "rewards/correct_reward_func/std": 0.13477934896945953, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1380.261962890625, "completions/mean_terminated_length": 1380.261962890625, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 0.5623052959501558, "grad_norm": 0.6290069818496704, "kl": 0.04558514803647995, "learning_rate": 1.7812499999999999e-06, "loss": 0.0065, "num_tokens": 46484542.0, "reward": 1.4386088848114014, "reward_std": 0.09025963395833969, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4505135715007782, "rewards/correct_reward_func/std": 0.12712764739990234, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1388.9881591796875, "completions/mean_terminated_length": 1388.9881591796875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.5638629283489096, "grad_norm": 0.6121560335159302, "kl": 0.046877965331077576, "learning_rate": 1.7806249999999998e-06, "loss": -0.0113, "num_tokens": 46607247.0, "reward": 1.4698020219802856, "reward_std": 0.09116669744253159, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48170679807662964, "rewards/correct_reward_func/std": 0.1016574278473854, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1318.9881591796875, "completions/mean_terminated_length": 1318.9881591796875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.5654205607476636, "grad_norm": 0.6227669715881348, "kl": 0.046381985768675804, "learning_rate": 1.78e-06, "loss": -0.0136, "num_tokens": 46724030.0, "reward": 1.4805115461349487, "reward_std": 0.1278102546930313, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5043209791183472, "rewards/correct_reward_func/std": 0.19428442418575287, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 1378.107177734375, "completions/mean_terminated_length": 1378.107177734375, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 0.5669781931464174, "grad_norm": 0.6208792328834534, "kl": 0.048077501356601715, "learning_rate": 1.7793749999999998e-06, "loss": 0.015, "num_tokens": 46845689.0, "reward": 1.4393149614334106, "reward_std": 0.06981474906206131, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43931499123573303, "rewards/correct_reward_func/std": 0.13650043308734894, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 1391.65478515625, "completions/mean_terminated_length": 1309.722900390625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.5685358255451713, "grad_norm": 0.624458909034729, "kl": 0.04737947881221771, "learning_rate": 1.77875e-06, "loss": 0.0523, "num_tokens": 46968504.0, "reward": 1.3942302465438843, "reward_std": 0.11320418864488602, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.41803956031799316, "rewards/correct_reward_func/std": 0.14469270408153534, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1453.6905517578125, "completions/mean_terminated_length": 1372.5059814453125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 0.5700934579439252, "grad_norm": 0.5907891988754272, "kl": 0.04476970434188843, "learning_rate": 1.7781249999999998e-06, "loss": 0.0414, "num_tokens": 47096638.0, "reward": 1.4558826684951782, "reward_std": 0.08813583105802536, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4558826684951782, "rewards/correct_reward_func/std": 0.1586223989725113, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1346.416748046875, "completions/mean_terminated_length": 1346.416748046875, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 0.5716510903426791, "grad_norm": 0.6247698068618774, "kl": 0.04782709293067455, "learning_rate": 1.7775e-06, "loss": 0.0158, "num_tokens": 47215659.0, "reward": 1.4506251811981201, "reward_std": 0.061412323266267776, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4506250023841858, "rewards/correct_reward_func/std": 0.1287914216518402, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1385.511962890625, "completions/mean_terminated_length": 1385.511962890625, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 0.573208722741433, "grad_norm": 0.5953041911125183, "kl": 0.04662996344268322, "learning_rate": 1.7768749999999998e-06, "loss": -0.0291, "num_tokens": 47338246.0, "reward": 1.61995267868042, "reward_std": 0.071071557700634, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.6199524998664856, "rewards/correct_reward_func/std": 0.15339720249176025, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1350.6905517578125, "completions/mean_terminated_length": 1350.6905517578125, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.5747663551401869, "grad_norm": 0.596235990524292, "kl": 0.047611601650714874, "learning_rate": 1.77625e-06, "loss": 0.0142, "num_tokens": 47457746.0, "reward": 1.4904042482376099, "reward_std": 0.0895150825381279, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5023089647293091, "rewards/correct_reward_func/std": 0.10198992490768433, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1335.142822265625, "completions/mean_terminated_length": 1335.142822265625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.5763239875389408, "grad_norm": 0.6384495496749878, "kl": 0.04769720509648323, "learning_rate": 1.7756249999999998e-06, "loss": 0.0225, "num_tokens": 47575748.0, "reward": 1.4142802953720093, "reward_std": 0.08743462711572647, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4261849820613861, "rewards/correct_reward_func/std": 0.12853728234767914, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1369.0595703125, "completions/mean_terminated_length": 1369.0595703125, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.5778816199376947, "grad_norm": 0.5966677069664001, "kl": 0.04758315160870552, "learning_rate": 1.7749999999999997e-06, "loss": 0.0182, "num_tokens": 47696935.0, "reward": 1.4828351736068726, "reward_std": 0.06344291567802429, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.482835054397583, "rewards/correct_reward_func/std": 0.13246676325798035, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 1291.5714111328125, "completions/mean_terminated_length": 1291.5714111328125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5794392523364486, "grad_norm": 0.6525982618331909, "kl": 0.049696190282702446, "learning_rate": 1.774375e-06, "loss": -0.0193, "num_tokens": 47811241.0, "reward": 1.4084582328796387, "reward_std": 0.06577665358781815, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40845808386802673, "rewards/correct_reward_func/std": 0.11660967767238617, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 1518.5595703125, "completions/mean_terminated_length": 1438.1566162109375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.5809968847352025, "grad_norm": 0.5672593116760254, "kl": 0.04501592554152012, "learning_rate": 1.77375e-06, "loss": 0.0895, "num_tokens": 47944788.0, "reward": 1.5155577659606934, "reward_std": 0.06096799299120903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5155577659606934, "rewards/correct_reward_func/std": 0.16944406926631927, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1468.6309814453125, "completions/mean_terminated_length": 1468.6309814453125, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 0.5825545171339563, "grad_norm": 0.5926371216773987, "kl": 0.04884720593690872, "learning_rate": 1.773125e-06, "loss": 0.0291, "num_tokens": 48074309.0, "reward": 1.5413291454315186, "reward_std": 0.074510857462883, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5413291454315186, "rewards/correct_reward_func/std": 0.15059438347816467, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1333.75, "completions/mean_terminated_length": 1333.75, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.5841121495327103, "grad_norm": 0.6363287568092346, "kl": 0.04845425486564636, "learning_rate": 1.7725e-06, "loss": -0.0013, "num_tokens": 48192344.0, "reward": 1.4435120820999146, "reward_std": 0.09547659754753113, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.455416738986969, "rewards/correct_reward_func/std": 0.17547385394573212, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1348.09521484375, "completions/mean_terminated_length": 1348.09521484375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.5856697819314641, "grad_norm": 0.6347678303718567, "kl": 0.04872422479093075, "learning_rate": 1.771875e-06, "loss": -0.0063, "num_tokens": 48311446.0, "reward": 1.532442569732666, "reward_std": 0.08060499280691147, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5324423909187317, "rewards/correct_reward_func/std": 0.1389501988887787, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1335.6905517578125, "completions/mean_terminated_length": 1335.6905517578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5872274143302181, "grad_norm": 0.6710498929023743, "kl": 0.04991703853011131, "learning_rate": 1.77125e-06, "loss": -0.0097, "num_tokens": 48429530.0, "reward": 1.4852207899093628, "reward_std": 0.11480290442705154, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5090302228927612, "rewards/correct_reward_func/std": 0.18953540921211243, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 1367.1190185546875, "completions/mean_terminated_length": 1367.1190185546875, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.5887850467289719, "grad_norm": 0.5975840091705322, "kl": 0.04870462976396084, "learning_rate": 1.7706249999999999e-06, "loss": 0.0098, "num_tokens": 48550482.0, "reward": 1.4639222621917725, "reward_std": 0.09818486869335175, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47582679986953735, "rewards/correct_reward_func/std": 0.15385620296001434, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 1391.6190185546875, "completions/mean_terminated_length": 1391.6190185546875, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 0.5903426791277259, "grad_norm": 0.5850672721862793, "kl": 0.04981931112706661, "learning_rate": 1.77e-06, "loss": -0.0048, "num_tokens": 48673492.0, "reward": 1.4705395698547363, "reward_std": 0.07982930541038513, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4705394506454468, "rewards/correct_reward_func/std": 0.1553632766008377, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1282.952392578125, "completions/mean_terminated_length": 1282.952392578125, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.5919003115264797, "grad_norm": 0.6228066682815552, "kl": 0.048284122720360756, "learning_rate": 1.769375e-06, "loss": -0.0227, "num_tokens": 48786996.0, "reward": 1.436045527458191, "reward_std": 0.05097164586186409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4360455572605133, "rewards/correct_reward_func/std": 0.12165073305368423, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1336.7738037109375, "completions/mean_terminated_length": 1336.7738037109375, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.5934579439252337, "grad_norm": 0.6182482838630676, "kl": 0.047869689762592316, "learning_rate": 1.76875e-06, "loss": -0.0094, "num_tokens": 48905447.0, "reward": 1.4301729202270508, "reward_std": 0.08307760953903198, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44207748770713806, "rewards/correct_reward_func/std": 0.16730858385562897, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2161.0, "completions/max_terminated_length": 2161.0, "completions/mean_length": 1325.9881591796875, "completions/mean_terminated_length": 1325.9881591796875, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 0.5950155763239875, "grad_norm": 0.6429280638694763, "kl": 0.0489403922110796, "learning_rate": 1.768125e-06, "loss": -0.0063, "num_tokens": 49022764.0, "reward": 1.4742978811264038, "reward_std": 0.04766622185707092, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47429779171943665, "rewards/correct_reward_func/std": 0.11626514792442322, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1350.09521484375, "completions/mean_terminated_length": 1350.09521484375, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.5965732087227414, "grad_norm": 0.6260592341423035, "kl": 0.05001649633049965, "learning_rate": 1.7675e-06, "loss": 0.0078, "num_tokens": 49142112.0, "reward": 1.5085965394973755, "reward_std": 0.06493545323610306, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5085963606834412, "rewards/correct_reward_func/std": 0.12024178355932236, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1421.166748046875, "completions/mean_terminated_length": 1421.166748046875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.5981308411214953, "grad_norm": 0.577016294002533, "kl": 0.05048423446714878, "learning_rate": 1.766875e-06, "loss": 0.0171, "num_tokens": 49267316.0, "reward": 1.4990577697753906, "reward_std": 0.10350355505943298, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5109624266624451, "rewards/correct_reward_func/std": 0.17975008487701416, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1388.6190185546875, "completions/mean_terminated_length": 1388.6190185546875, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.5996884735202492, "grad_norm": 0.6296937465667725, "kl": 0.048955587670207024, "learning_rate": 1.76625e-06, "loss": 0.0216, "num_tokens": 49390212.0, "reward": 1.4979599714279175, "reward_std": 0.09178230166435242, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5098646283149719, "rewards/correct_reward_func/std": 0.16648715734481812, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1338.8214111328125, "completions/mean_terminated_length": 1338.8214111328125, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.6012461059190031, "grad_norm": 0.5855390429496765, "kl": 0.04758539795875549, "learning_rate": 1.765625e-06, "loss": -0.0172, "num_tokens": 49508787.0, "reward": 1.4601309299468994, "reward_std": 0.04890606552362442, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46013087034225464, "rewards/correct_reward_func/std": 0.1410851925611496, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1426.46435546875, "completions/mean_terminated_length": 1344.9517822265625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.602803738317757, "grad_norm": 0.5801010131835938, "kl": 0.04862123541533947, "learning_rate": 1.7649999999999998e-06, "loss": 0.033, "num_tokens": 49634694.0, "reward": 1.5094380378723145, "reward_std": 0.07422788441181183, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5094379186630249, "rewards/correct_reward_func/std": 0.18413080275058746, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1389.4761962890625, "completions/mean_terminated_length": 1389.4761962890625, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.6043613707165109, "grad_norm": 0.6577509641647339, "kl": 0.050652796402573586, "learning_rate": 1.764375e-06, "loss": 0.0029, "num_tokens": 49757398.0, "reward": 1.5519939661026, "reward_std": 0.06602007895708084, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5519937872886658, "rewards/correct_reward_func/std": 0.12210499495267868, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1296.1785888671875, "completions/mean_terminated_length": 1296.1785888671875, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.6059190031152648, "grad_norm": 0.6384318470954895, "kl": 0.04982003942131996, "learning_rate": 1.7637499999999998e-06, "loss": -0.0317, "num_tokens": 49872283.0, "reward": 1.474208116531372, "reward_std": 0.056270867586135864, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4742080271244049, "rewards/correct_reward_func/std": 0.1628669947385788, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2054.0, "completions/max_terminated_length": 2054.0, "completions/mean_length": 1370.7738037109375, "completions/mean_terminated_length": 1370.7738037109375, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 0.6074766355140186, "grad_norm": 0.6070489287376404, "kl": 0.04989171586930752, "learning_rate": 1.763125e-06, "loss": 0.0072, "num_tokens": 49993458.0, "reward": 1.4242897033691406, "reward_std": 0.12058194726705551, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4361944794654846, "rewards/correct_reward_func/std": 0.15444742143154144, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1269.6190185546875, "completions/mean_terminated_length": 1269.6190185546875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.6090342679127726, "grad_norm": 0.6277110576629639, "kl": 0.05084827356040478, "learning_rate": 1.7624999999999999e-06, "loss": -0.0106, "num_tokens": 50105788.0, "reward": 1.4761323928833008, "reward_std": 0.08773455768823624, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47613224387168884, "rewards/correct_reward_func/std": 0.18977254629135132, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1340.547607421875, "completions/mean_terminated_length": 1340.547607421875, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.6105919003115264, "grad_norm": 0.6418157815933228, "kl": 0.05028718709945679, "learning_rate": 1.761875e-06, "loss": 0.0052, "num_tokens": 50224304.0, "reward": 1.5189129114151, "reward_std": 0.06859312951564789, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5189128518104553, "rewards/correct_reward_func/std": 0.13187937438488007, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1348.2857666015625, "completions/mean_terminated_length": 1348.2857666015625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.6121495327102804, "grad_norm": 0.6196921467781067, "kl": 0.05028197728097439, "learning_rate": 1.7612499999999999e-06, "loss": -0.0127, "num_tokens": 50343602.0, "reward": 1.4608813524246216, "reward_std": 0.0647798702120781, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4608812928199768, "rewards/correct_reward_func/std": 0.10814743489027023, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1432.761962890625, "completions/mean_terminated_length": 1351.3251953125, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.6137071651090342, "grad_norm": 0.5915994048118591, "kl": 0.04747145250439644, "learning_rate": 1.760625e-06, "loss": 0.0774, "num_tokens": 50470044.0, "reward": 1.4953646659851074, "reward_std": 0.06458833068609238, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49536460638046265, "rewards/correct_reward_func/std": 0.15716253221035004, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2262.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 1341.4761962890625, "completions/mean_terminated_length": 1341.4761962890625, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.6152647975077882, "grad_norm": 0.6374837756156921, "kl": 0.05029851756989956, "learning_rate": 1.7599999999999999e-06, "loss": -0.0057, "num_tokens": 50588620.0, "reward": 1.4544249773025513, "reward_std": 0.06668942421674728, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4544249176979065, "rewards/correct_reward_func/std": 0.1269298493862152, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 1336.6785888671875, "completions/mean_terminated_length": 1336.6785888671875, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.616822429906542, "grad_norm": 0.6182777881622314, "kl": 0.04916258528828621, "learning_rate": 1.7593749999999998e-06, "loss": -0.0244, "num_tokens": 50706799.0, "reward": 1.5116595029830933, "reward_std": 0.056161068379879, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.511659562587738, "rewards/correct_reward_func/std": 0.17195159196853638, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 1375.8690185546875, "completions/mean_terminated_length": 1375.8690185546875, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 0.618380062305296, "grad_norm": 0.60200434923172, "kl": 0.049690814688801765, "learning_rate": 1.7587499999999999e-06, "loss": -0.0061, "num_tokens": 50828270.0, "reward": 1.492004156112671, "reward_std": 0.07065374404191971, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4920039772987366, "rewards/correct_reward_func/std": 0.14335261285305023, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 1516.9761962890625, "completions/mean_terminated_length": 1436.55419921875, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.6199376947040498, "grad_norm": 0.587050199508667, "kl": 0.048227181658148766, "learning_rate": 1.7581249999999998e-06, "loss": 0.0757, "num_tokens": 50961774.0, "reward": 1.5450599193572998, "reward_std": 0.09599590301513672, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.556964635848999, "rewards/correct_reward_func/std": 0.1692476123571396, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 1380.5357666015625, "completions/mean_terminated_length": 1380.5357666015625, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 0.6214953271028038, "grad_norm": 0.550334095954895, "kl": 0.04830704443156719, "learning_rate": 1.7575e-06, "loss": -0.0391, "num_tokens": 51083847.0, "reward": 1.4870414733886719, "reward_std": 0.06050838157534599, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48704153299331665, "rewards/correct_reward_func/std": 0.13304997980594635, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1446.96435546875, "completions/mean_terminated_length": 1446.96435546875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.6230529595015576, "grad_norm": 0.6399713754653931, "kl": 0.04986717738211155, "learning_rate": 1.7568749999999998e-06, "loss": -0.0089, "num_tokens": 51211506.0, "reward": 1.505528211593628, "reward_std": 0.057832684367895126, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5055281519889832, "rewards/correct_reward_func/std": 0.18946535885334015, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 1390.4285888671875, "completions/mean_terminated_length": 1390.4285888671875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.6246105919003115, "grad_norm": 0.6153919100761414, "kl": 0.048483846709132195, "learning_rate": 1.75625e-06, "loss": 0.0004, "num_tokens": 51334536.0, "reward": 1.5144027471542358, "reward_std": 0.046569447964429855, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5144026875495911, "rewards/correct_reward_func/std": 0.10261337459087372, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1555.4761962890625, "completions/mean_terminated_length": 1475.51806640625, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.6261682242990654, "grad_norm": 0.5729800462722778, "kl": 0.04777614213526249, "learning_rate": 1.7556249999999998e-06, "loss": 0.0621, "num_tokens": 51471334.0, "reward": 1.477668285369873, "reward_std": 0.08024942129850388, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47766822576522827, "rewards/correct_reward_func/std": 0.12405380606651306, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1394.0714111328125, "completions/mean_terminated_length": 1394.0714111328125, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 0.6277258566978193, "grad_norm": 0.6006665229797363, "kl": 0.048750247806310654, "learning_rate": 1.7549999999999997e-06, "loss": 0.0141, "num_tokens": 51594508.0, "reward": 1.5268176794052124, "reward_std": 0.06284648180007935, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5268176794052124, "rewards/correct_reward_func/std": 0.1302812248468399, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1379.416748046875, "completions/mean_terminated_length": 1379.416748046875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6292834890965732, "grad_norm": 0.6163754463195801, "kl": 0.0525053720921278, "learning_rate": 1.754375e-06, "loss": -0.0313, "num_tokens": 51716115.0, "reward": 1.5108131170272827, "reward_std": 0.09598790854215622, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5108129382133484, "rewards/correct_reward_func/std": 0.17914439737796783, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 1451.5595703125, "completions/mean_terminated_length": 1451.5595703125, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.6308411214953271, "grad_norm": 0.6257968544960022, "kl": 0.0501710157841444, "learning_rate": 1.75375e-06, "loss": -0.0006, "num_tokens": 51843950.0, "reward": 1.4702588319778442, "reward_std": 0.08658844977617264, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48216357827186584, "rewards/correct_reward_func/std": 0.13416936993598938, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 1460.0, "completions/mean_terminated_length": 1460.0, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.632398753894081, "grad_norm": 0.5904620885848999, "kl": 0.049389807507395744, "learning_rate": 1.753125e-06, "loss": -0.0261, "num_tokens": 51972656.0, "reward": 1.5546208620071411, "reward_std": 0.06326793879270554, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5546208024024963, "rewards/correct_reward_func/std": 0.1749102622270584, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3438.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 1483.4761962890625, "completions/mean_terminated_length": 1483.4761962890625, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.6339563862928349, "grad_norm": 0.5588891506195068, "kl": 0.04781218431890011, "learning_rate": 1.7525e-06, "loss": 0.0121, "num_tokens": 52103292.0, "reward": 1.540654182434082, "reward_std": 0.06845831125974655, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5406539440155029, "rewards/correct_reward_func/std": 0.22292810678482056, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1564.34521484375, "completions/mean_terminated_length": 1564.34521484375, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.6355140186915887, "grad_norm": 0.568066418170929, "kl": 0.050099100917577744, "learning_rate": 1.751875e-06, "loss": 0.0297, "num_tokens": 52240961.0, "reward": 1.460665225982666, "reward_std": 0.04765839874744415, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46066510677337646, "rewards/correct_reward_func/std": 0.09499367326498032, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 1502.107177734375, "completions/mean_terminated_length": 1502.107177734375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.6370716510903427, "grad_norm": 0.6036926507949829, "kl": 0.049950817599892616, "learning_rate": 1.75125e-06, "loss": -0.0168, "num_tokens": 52373144.0, "reward": 1.4840716123580933, "reward_std": 0.09531796723604202, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4959762990474701, "rewards/correct_reward_func/std": 0.1371496170759201, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.04761904761904767, "completions/max_length": 8192.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 1858.1905517578125, "completions/mean_terminated_length": 1541.5, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.6386292834890965, "grad_norm": 0.498012512922287, "kl": 0.04354623891413212, "learning_rate": 1.750625e-06, "loss": 0.2021, "num_tokens": 52535262.0, "reward": 1.4536290168762207, "reward_std": 0.09940145164728165, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4536289870738983, "rewards/correct_reward_func/std": 0.18348245322704315, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3503.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 1549.952392578125, "completions/mean_terminated_length": 1549.952392578125, "completions/min_length": 981.0, "completions/min_terminated_length": 981.0, "epoch": 0.6401869158878505, "grad_norm": 0.5648190379142761, "kl": 0.048112260177731514, "learning_rate": 1.75e-06, "loss": 0.004, "num_tokens": 52671566.0, "reward": 1.4433013200759888, "reward_std": 0.09281626343727112, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45520591735839844, "rewards/correct_reward_func/std": 0.1479065865278244, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1474.416748046875, "completions/mean_terminated_length": 1474.416748046875, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.6417445482866043, "grad_norm": 0.5981578826904297, "kl": 0.05129780061542988, "learning_rate": 1.7493749999999999e-06, "loss": 0.0015, "num_tokens": 52801327.0, "reward": 1.5297049283981323, "reward_std": 0.07844500243663788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5297048687934875, "rewards/correct_reward_func/std": 0.17416192591190338, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 1552.8214111328125, "completions/mean_terminated_length": 1552.8214111328125, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.6433021806853583, "grad_norm": 0.551489531993866, "kl": 0.04923750273883343, "learning_rate": 1.74875e-06, "loss": -0.005, "num_tokens": 52937830.0, "reward": 1.544388771057129, "reward_std": 0.07821746915578842, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5562934875488281, "rewards/correct_reward_func/std": 0.155470609664917, "step": 413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 1587.2857666015625, "completions/mean_terminated_length": 1587.2857666015625, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 0.6448598130841121, "grad_norm": 0.5425035953521729, "kl": 0.049748532474040985, "learning_rate": 1.7481249999999999e-06, "loss": -0.0384, "num_tokens": 53077198.0, "reward": 1.522072434425354, "reward_std": 0.10684633255004883, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5339770317077637, "rewards/correct_reward_func/std": 0.17637047171592712, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2465.0, "completions/mean_length": 1585.9405517578125, "completions/mean_terminated_length": 1506.349365234375, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "epoch": 0.6464174454828661, "grad_norm": 0.5822159647941589, "kl": 0.04930712282657623, "learning_rate": 1.7475e-06, "loss": 0.0505, "num_tokens": 53216147.0, "reward": 1.520836591720581, "reward_std": 0.09006838500499725, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5208365321159363, "rewards/correct_reward_func/std": 0.19083261489868164, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 1632.4405517578125, "completions/mean_terminated_length": 1553.4095458984375, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 0.6479750778816199, "grad_norm": 0.5585830211639404, "kl": 0.04699916951358318, "learning_rate": 1.746875e-06, "loss": -0.0579, "num_tokens": 53359182.0, "reward": 1.4239436388015747, "reward_std": 0.05681487172842026, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42394357919692993, "rewards/correct_reward_func/std": 0.12871341407299042, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3221.0, "completions/max_terminated_length": 3221.0, "completions/mean_length": 1636.5357666015625, "completions/mean_terminated_length": 1636.5357666015625, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.6495327102803738, "grad_norm": 0.6002727150917053, "kl": 0.05007455497980118, "learning_rate": 1.74625e-06, "loss": 0.0327, "num_tokens": 53502591.0, "reward": 1.4320467710494995, "reward_std": 0.05887793377041817, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4320466220378876, "rewards/correct_reward_func/std": 0.11791915446519852, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1614.452392578125, "completions/mean_terminated_length": 1535.2047119140625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.6510903426791277, "grad_norm": 0.601240873336792, "kl": 0.051675185561180115, "learning_rate": 1.745625e-06, "loss": 0.0749, "num_tokens": 53644247.0, "reward": 1.486029028892517, "reward_std": 0.07044512033462524, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48602885007858276, "rewards/correct_reward_func/std": 0.1468798667192459, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 1591.75, "completions/mean_terminated_length": 1591.75, "completions/min_length": 1031.0, "completions/min_terminated_length": 1031.0, "epoch": 0.6526479750778816, "grad_norm": 0.6253805756568909, "kl": 0.051795635372400284, "learning_rate": 1.745e-06, "loss": -0.0031, "num_tokens": 53783720.0, "reward": 1.4498029947280884, "reward_std": 0.08419051766395569, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46170762181282043, "rewards/correct_reward_func/std": 0.17074517905712128, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1521.8214111328125, "completions/mean_terminated_length": 1521.8214111328125, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.6542056074766355, "grad_norm": 0.6050965785980225, "kl": 0.05233046971261501, "learning_rate": 1.744375e-06, "loss": -0.019, "num_tokens": 53917445.0, "reward": 1.435978651046753, "reward_std": 0.12028548866510391, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.459788054227829, "rewards/correct_reward_func/std": 0.14839471876621246, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 1687.952392578125, "completions/mean_terminated_length": 1609.59033203125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 0.6557632398753894, "grad_norm": 0.542452871799469, "kl": 0.05065176263451576, "learning_rate": 1.7437499999999998e-06, "loss": 0.0578, "num_tokens": 54065257.0, "reward": 1.4500166177749634, "reward_std": 0.09093791991472244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4500165581703186, "rewards/correct_reward_func/std": 0.16098248958587646, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 1550.5714111328125, "completions/mean_terminated_length": 1550.5714111328125, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.6573208722741433, "grad_norm": 0.5730518102645874, "kl": 0.050289461389184, "learning_rate": 1.743125e-06, "loss": -0.0254, "num_tokens": 54201613.0, "reward": 1.4041987657546997, "reward_std": 0.046286944299936295, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4041987359523773, "rewards/correct_reward_func/std": 0.13344568014144897, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1631.297607421875, "completions/mean_terminated_length": 1552.2529296875, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 0.6588785046728972, "grad_norm": 0.5699371099472046, "kl": 0.05113396793603897, "learning_rate": 1.7424999999999998e-06, "loss": 0.0358, "num_tokens": 54344702.0, "reward": 1.4777201414108276, "reward_std": 0.07488631457090378, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47772011160850525, "rewards/correct_reward_func/std": 0.13865311443805695, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1677.59521484375, "completions/mean_terminated_length": 1599.1083984375, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.660436137071651, "grad_norm": 0.5397751927375793, "kl": 0.05114184692502022, "learning_rate": 1.741875e-06, "loss": -0.0164, "num_tokens": 54491662.0, "reward": 1.4992514848709106, "reward_std": 0.0635334923863411, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4992513060569763, "rewards/correct_reward_func/std": 0.1298021823167801, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1552.1429443359375, "completions/mean_terminated_length": 1552.1429443359375, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "epoch": 0.661993769470405, "grad_norm": 0.5741438269615173, "kl": 0.05266575887799263, "learning_rate": 1.7412499999999998e-06, "loss": 0.0127, "num_tokens": 54627778.0, "reward": 1.4440009593963623, "reward_std": 0.08349818736314774, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45590564608573914, "rewards/correct_reward_func/std": 0.17922662198543549, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 1625.3929443359375, "completions/mean_terminated_length": 1625.3929443359375, "completions/min_length": 981.0, "completions/min_terminated_length": 981.0, "epoch": 0.6635514018691588, "grad_norm": 0.5885212421417236, "kl": 0.05119376443326473, "learning_rate": 1.740625e-06, "loss": 0.0118, "num_tokens": 54770425.0, "reward": 1.473638653755188, "reward_std": 0.0765259712934494, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4736386239528656, "rewards/correct_reward_func/std": 0.10725454986095428, "step": 426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 1690.8333740234375, "completions/mean_terminated_length": 1612.5059814453125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 0.6651090342679128, "grad_norm": 0.5927914381027222, "kl": 0.04985920339822769, "learning_rate": 1.7399999999999999e-06, "loss": 0.0531, "num_tokens": 54918389.0, "reward": 1.4642417430877686, "reward_std": 0.12046536058187485, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4761464297771454, "rewards/correct_reward_func/std": 0.1584354043006897, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1637.1190185546875, "completions/mean_terminated_length": 1637.1190185546875, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "epoch": 0.6666666666666666, "grad_norm": 0.5880814790725708, "kl": 0.05253339186310768, "learning_rate": 1.7393749999999998e-06, "loss": -0.0262, "num_tokens": 55061943.0, "reward": 1.5044912099838257, "reward_std": 0.08261405676603317, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5044911503791809, "rewards/correct_reward_func/std": 0.17851826548576355, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 1644.8929443359375, "completions/mean_terminated_length": 1566.011962890625, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "epoch": 0.6682242990654206, "grad_norm": 0.5974320769309998, "kl": 0.04985599033534527, "learning_rate": 1.7387499999999999e-06, "loss": 0.0317, "num_tokens": 55205916.0, "reward": 1.472242832183838, "reward_std": 0.13399535417556763, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4960523247718811, "rewards/correct_reward_func/std": 0.16596059501171112, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1612.9881591796875, "completions/mean_terminated_length": 1612.9881591796875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.6697819314641744, "grad_norm": 0.5865140557289124, "kl": 0.05195549875497818, "learning_rate": 1.7381249999999998e-06, "loss": 0.0019, "num_tokens": 55347581.0, "reward": 1.4284037351608276, "reward_std": 0.09993235766887665, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.45221319794654846, "rewards/correct_reward_func/std": 0.11526290327310562, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 1535.5714111328125, "completions/mean_terminated_length": 1535.5714111328125, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.6713395638629284, "grad_norm": 0.5939237475395203, "kl": 0.05092081241309643, "learning_rate": 1.7374999999999999e-06, "loss": 0.0117, "num_tokens": 55482497.0, "reward": 1.5227851867675781, "reward_std": 0.09420502930879593, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5346897840499878, "rewards/correct_reward_func/std": 0.13495904207229614, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 1559.416748046875, "completions/mean_terminated_length": 1559.416748046875, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.6728971962616822, "grad_norm": 0.5865123271942139, "kl": 0.05216217786073685, "learning_rate": 1.7368749999999998e-06, "loss": 0.0131, "num_tokens": 55619320.0, "reward": 1.5050667524337769, "reward_std": 0.08095559477806091, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5169714689254761, "rewards/correct_reward_func/std": 0.18940287828445435, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 1583.1429443359375, "completions/mean_terminated_length": 1583.1429443359375, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 0.6744548286604362, "grad_norm": 0.6121124625205994, "kl": 0.04994286224246025, "learning_rate": 1.7362499999999999e-06, "loss": 0.0266, "num_tokens": 55758322.0, "reward": 1.4305202960968018, "reward_std": 0.0557343065738678, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43052029609680176, "rewards/correct_reward_func/std": 0.15990039706230164, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 4451.0, "completions/mean_length": 1662.0357666015625, "completions/mean_terminated_length": 1583.361328125, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.67601246105919, "grad_norm": 0.5868165493011475, "kl": 0.04885072074830532, "learning_rate": 1.7356249999999998e-06, "loss": 0.0522, "num_tokens": 55903879.0, "reward": 1.4234789609909058, "reward_std": 0.1545991748571396, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.47109803557395935, "rewards/correct_reward_func/std": 0.1350628137588501, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 1513.5714111328125, "completions/mean_terminated_length": 1513.5714111328125, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 0.677570093457944, "grad_norm": 0.5805670619010925, "kl": 0.052099065855145454, "learning_rate": 1.7350000000000001e-06, "loss": -0.0069, "num_tokens": 56036809.0, "reward": 1.4437384605407715, "reward_std": 0.05860109254717827, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44373825192451477, "rewards/correct_reward_func/std": 0.19147835671901703, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 1667.5238037109375, "completions/mean_terminated_length": 1588.9156494140625, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "epoch": 0.6791277258566978, "grad_norm": 0.5827205181121826, "kl": 0.048958078026771545, "learning_rate": 1.734375e-06, "loss": 0.0677, "num_tokens": 56182893.0, "reward": 1.4653565883636475, "reward_std": 0.08377533406019211, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4653565585613251, "rewards/correct_reward_func/std": 0.17159578204154968, "step": 436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 1661.6785888671875, "completions/mean_terminated_length": 1583.0, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.6806853582554517, "grad_norm": 0.5805427432060242, "kl": 0.04939436540007591, "learning_rate": 1.73375e-06, "loss": -0.0382, "num_tokens": 56328444.0, "reward": 1.443485140800476, "reward_std": 0.06904201209545135, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44348499178886414, "rewards/correct_reward_func/std": 0.1347956359386444, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 1691.107177734375, "completions/mean_terminated_length": 1612.7830810546875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.6822429906542056, "grad_norm": 0.5515686273574829, "kl": 0.05067290551960468, "learning_rate": 1.733125e-06, "loss": 0.0507, "num_tokens": 56476701.0, "reward": 1.4295130968093872, "reward_std": 0.11515135318040848, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4533223509788513, "rewards/correct_reward_func/std": 0.16026785969734192, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 1521.6309814453125, "completions/mean_terminated_length": 1521.6309814453125, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.6838006230529595, "grad_norm": 0.5936988592147827, "kl": 0.052045663818717, "learning_rate": 1.7325e-06, "loss": 0.031, "num_tokens": 56610566.0, "reward": 1.4802929162979126, "reward_std": 0.08335726708173752, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.504102349281311, "rewards/correct_reward_func/std": 0.1541454792022705, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1464.0595703125, "completions/mean_terminated_length": 1464.0595703125, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 0.6853582554517134, "grad_norm": 0.6046100854873657, "kl": 0.05066749081015587, "learning_rate": 1.731875e-06, "loss": -0.006, "num_tokens": 56739727.0, "reward": 1.4421093463897705, "reward_std": 0.05164036527276039, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4421093463897705, "rewards/correct_reward_func/std": 0.14187321066856384, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 1621.3929443359375, "completions/mean_terminated_length": 1461.134033203125, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.6869158878504673, "grad_norm": 0.5737205147743225, "kl": 0.04900176823139191, "learning_rate": 1.73125e-06, "loss": 0.0934, "num_tokens": 56881816.0, "reward": 1.42433500289917, "reward_std": 0.0748237892985344, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42433494329452515, "rewards/correct_reward_func/std": 0.18206322193145752, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 1491.047607421875, "completions/mean_terminated_length": 1491.047607421875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.6884735202492211, "grad_norm": 0.6053724884986877, "kl": 0.052103569731116295, "learning_rate": 1.730625e-06, "loss": -0.0204, "num_tokens": 57012968.0, "reward": 1.5510308742523193, "reward_std": 0.09038885682821274, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5629354119300842, "rewards/correct_reward_func/std": 0.22346119582653046, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1477.84521484375, "completions/mean_terminated_length": 1477.84521484375, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.6900311526479751, "grad_norm": 0.5881515741348267, "kl": 0.05128224939107895, "learning_rate": 1.73e-06, "loss": -0.0017, "num_tokens": 57143023.0, "reward": 1.4883114099502563, "reward_std": 0.062143657356500626, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48831140995025635, "rewards/correct_reward_func/std": 0.13774645328521729, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1422.7261962890625, "completions/mean_terminated_length": 1422.7261962890625, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.6915887850467289, "grad_norm": 0.6870121359825134, "kl": 0.0524381622672081, "learning_rate": 1.729375e-06, "loss": 0.0179, "num_tokens": 57268346.0, "reward": 1.4325767755508423, "reward_std": 0.11797544360160828, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4444815218448639, "rewards/correct_reward_func/std": 0.13778088986873627, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1486.011962890625, "completions/mean_terminated_length": 1486.011962890625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.6931464174454829, "grad_norm": 0.589530348777771, "kl": 0.04958914779126644, "learning_rate": 1.72875e-06, "loss": -0.0082, "num_tokens": 57399075.0, "reward": 1.4617598056793213, "reward_std": 0.0732613280415535, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4617597460746765, "rewards/correct_reward_func/std": 0.20478412508964539, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1440.261962890625, "completions/mean_terminated_length": 1440.261962890625, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.6947040498442367, "grad_norm": 0.5789417624473572, "kl": 0.049345508217811584, "learning_rate": 1.7281249999999999e-06, "loss": -0.0048, "num_tokens": 57526129.0, "reward": 1.4764251708984375, "reward_std": 0.060231760144233704, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4764251708984375, "rewards/correct_reward_func/std": 0.15916836261749268, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1481.5, "completions/mean_terminated_length": 1481.5, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.6962616822429907, "grad_norm": 0.5849701762199402, "kl": 0.052394647151231766, "learning_rate": 1.7275e-06, "loss": -0.0097, "num_tokens": 57656521.0, "reward": 1.5263036489486694, "reward_std": 0.07463201880455017, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5263035893440247, "rewards/correct_reward_func/std": 0.17446979880332947, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3242.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 1495.3095703125, "completions/mean_terminated_length": 1495.3095703125, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.6978193146417445, "grad_norm": 0.628301203250885, "kl": 0.05113241821527481, "learning_rate": 1.7268749999999999e-06, "loss": 0.0129, "num_tokens": 57788163.0, "reward": 1.4820013046264648, "reward_std": 0.09342510253190994, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49390602111816406, "rewards/correct_reward_func/std": 0.12805330753326416, "step": 448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 1449.011962890625, "completions/mean_terminated_length": 1449.011962890625, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.6993769470404985, "grad_norm": 0.626068651676178, "kl": 0.05249497666954994, "learning_rate": 1.72625e-06, "loss": 0.0168, "num_tokens": 57916012.0, "reward": 1.4972912073135376, "reward_std": 0.07907280325889587, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.509195864200592, "rewards/correct_reward_func/std": 0.15952186286449432, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1396.7381591796875, "completions/mean_terminated_length": 1396.7381591796875, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.7009345794392523, "grad_norm": 0.5788159966468811, "kl": 0.0519126933068037, "learning_rate": 1.7256249999999999e-06, "loss": -0.0161, "num_tokens": 58039266.0, "reward": 1.5347965955734253, "reward_std": 0.05890589952468872, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5347966551780701, "rewards/correct_reward_func/std": 0.13743919134140015, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1381.011962890625, "completions/mean_terminated_length": 1381.011962890625, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 0.7024922118380063, "grad_norm": 0.6056314706802368, "kl": 0.05103152059018612, "learning_rate": 1.725e-06, "loss": 0.0104, "num_tokens": 58161217.0, "reward": 1.479777455329895, "reward_std": 0.06452760100364685, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47977739572525024, "rewards/correct_reward_func/std": 0.1589631587266922, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1404.9285888671875, "completions/mean_terminated_length": 1404.9285888671875, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.7040498442367601, "grad_norm": 0.6056322455406189, "kl": 0.05121096037328243, "learning_rate": 1.724375e-06, "loss": 0.0433, "num_tokens": 58285201.0, "reward": 1.568735957145691, "reward_std": 0.0742240622639656, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5687359571456909, "rewards/correct_reward_func/std": 0.1560218781232834, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 1456.1429443359375, "completions/mean_terminated_length": 1374.9879150390625, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.705607476635514, "grad_norm": 0.6087128520011902, "kl": 0.05143558606505394, "learning_rate": 1.7237499999999998e-06, "loss": 0.0035, "num_tokens": 58413421.0, "reward": 1.4904062747955322, "reward_std": 0.09947887063026428, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5023109912872314, "rewards/correct_reward_func/std": 0.18832674622535706, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 1527.4285888671875, "completions/mean_terminated_length": 1527.4285888671875, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.7071651090342679, "grad_norm": 0.607707679271698, "kl": 0.054172057658433914, "learning_rate": 1.723125e-06, "loss": -0.0295, "num_tokens": 58547887.0, "reward": 1.4630801677703857, "reward_std": 0.07300285249948502, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4749848544597626, "rewards/correct_reward_func/std": 0.16017581522464752, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 1457.2857666015625, "completions/mean_terminated_length": 1457.2857666015625, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.7087227414330218, "grad_norm": 0.6382337212562561, "kl": 0.05202684551477432, "learning_rate": 1.7224999999999998e-06, "loss": -0.0015, "num_tokens": 58676275.0, "reward": 1.4991950988769531, "reward_std": 0.10090982168912888, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5110997557640076, "rewards/correct_reward_func/std": 0.14260563254356384, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1584.9285888671875, "completions/mean_terminated_length": 1423.7803955078125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.7102803738317757, "grad_norm": 0.567070722579956, "kl": 0.08697609417140484, "learning_rate": 1.721875e-06, "loss": 0.104, "num_tokens": 58815337.0, "reward": 1.4610228538513184, "reward_std": 0.08824088424444199, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46102282404899597, "rewards/correct_reward_func/std": 0.17577558755874634, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 1508.1190185546875, "completions/mean_terminated_length": 1427.59033203125, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.7118380062305296, "grad_norm": 0.579940140247345, "kl": 0.04937991686165333, "learning_rate": 1.7212499999999998e-06, "loss": 0.0414, "num_tokens": 58947815.0, "reward": 1.44125497341156, "reward_std": 0.09176965802907944, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4531596899032593, "rewards/correct_reward_func/std": 0.11111555248498917, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1457.7261962890625, "completions/mean_terminated_length": 1457.7261962890625, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 0.7133956386292835, "grad_norm": 0.5890275239944458, "kl": 0.04897093586623669, "learning_rate": 1.720625e-06, "loss": -0.0176, "num_tokens": 59076216.0, "reward": 1.4860827922821045, "reward_std": 0.08254723250865936, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4979875087738037, "rewards/correct_reward_func/std": 0.14583896100521088, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1443.0, "completions/mean_terminated_length": 1443.0, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 0.7149532710280374, "grad_norm": 0.6299088001251221, "kl": 0.05053492821753025, "learning_rate": 1.7199999999999998e-06, "loss": 0.0295, "num_tokens": 59203470.0, "reward": 1.4992488622665405, "reward_std": 0.08619312942028046, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5111536979675293, "rewards/correct_reward_func/std": 0.1396382600069046, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1419.607177734375, "completions/mean_terminated_length": 1419.607177734375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 0.7165109034267912, "grad_norm": 0.6011829376220703, "kl": 0.051410723477602005, "learning_rate": 1.719375e-06, "loss": 0.0157, "num_tokens": 59328723.0, "reward": 1.5500842332839966, "reward_std": 0.06590086221694946, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5500842332839966, "rewards/correct_reward_func/std": 0.15555351972579956, "step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 1639.3690185546875, "completions/mean_terminated_length": 1479.5487060546875, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 0.7180685358255452, "grad_norm": 0.5221880078315735, "kl": 0.04706592485308647, "learning_rate": 1.7187499999999998e-06, "loss": 0.1247, "num_tokens": 59472472.0, "reward": 1.4629874229431152, "reward_std": 0.11201505362987518, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47489219903945923, "rewards/correct_reward_func/std": 0.16951104998588562, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 1470.6785888671875, "completions/mean_terminated_length": 1470.6785888671875, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.719626168224299, "grad_norm": 0.605837345123291, "kl": 0.04970187321305275, "learning_rate": 1.7181249999999997e-06, "loss": 0.0014, "num_tokens": 59602219.0, "reward": 1.5099036693572998, "reward_std": 0.06017957627773285, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5099035501480103, "rewards/correct_reward_func/std": 0.1773725152015686, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 1585.297607421875, "completions/mean_terminated_length": 1585.297607421875, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.721183800623053, "grad_norm": 0.5602222681045532, "kl": 0.049900198355317116, "learning_rate": 1.7174999999999999e-06, "loss": -0.0306, "num_tokens": 59741534.0, "reward": 1.505386233329773, "reward_std": 0.07570932060480118, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053861141204834, "rewards/correct_reward_func/std": 0.11848840862512589, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 1453.7381591796875, "completions/mean_terminated_length": 1453.7381591796875, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 0.7227414330218068, "grad_norm": 0.5771290063858032, "kl": 0.05098097398877144, "learning_rate": 1.7168749999999998e-06, "loss": -0.0083, "num_tokens": 59869402.0, "reward": 1.41793692111969, "reward_std": 0.07035666704177856, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41793686151504517, "rewards/correct_reward_func/std": 0.12359312176704407, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1601.15478515625, "completions/mean_terminated_length": 1601.15478515625, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.7242990654205608, "grad_norm": 0.5878456830978394, "kl": 0.051242388784885406, "learning_rate": 1.7162499999999999e-06, "loss": -0.0256, "num_tokens": 60010073.0, "reward": 1.5170272588729858, "reward_std": 0.08878003805875778, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5289318561553955, "rewards/correct_reward_func/std": 0.1320001482963562, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3673.0, "completions/max_terminated_length": 3673.0, "completions/mean_length": 1584.96435546875, "completions/mean_terminated_length": 1584.96435546875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.7258566978193146, "grad_norm": 0.56437748670578, "kl": 0.05264845862984657, "learning_rate": 1.7156249999999998e-06, "loss": -0.0124, "num_tokens": 60149144.0, "reward": 1.449500560760498, "reward_std": 0.06773830950260162, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44950050115585327, "rewards/correct_reward_func/std": 0.15649531781673431, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 1570.761962890625, "completions/mean_terminated_length": 1570.761962890625, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 0.7274143302180686, "grad_norm": 0.586487352848053, "kl": 0.05096551589667797, "learning_rate": 1.715e-06, "loss": -0.002, "num_tokens": 60287136.0, "reward": 1.5255881547927856, "reward_std": 0.08541964739561081, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5374928712844849, "rewards/correct_reward_func/std": 0.18721869587898254, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2230.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 1424.4405517578125, "completions/mean_terminated_length": 1424.4405517578125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.7289719626168224, "grad_norm": 0.5780627727508545, "kl": 0.05155480466783047, "learning_rate": 1.714375e-06, "loss": -0.0576, "num_tokens": 60412483.0, "reward": 1.4178261756896973, "reward_std": 0.0780172049999237, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4178261458873749, "rewards/correct_reward_func/std": 0.15439294278621674, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 1656.5357666015625, "completions/mean_terminated_length": 1656.5357666015625, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.7305295950155763, "grad_norm": 0.602411687374115, "kl": 0.05079780891537666, "learning_rate": 1.7137500000000001e-06, "loss": -0.0068, "num_tokens": 60557530.0, "reward": 1.455553412437439, "reward_std": 0.05027348920702934, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45555347204208374, "rewards/correct_reward_func/std": 0.08646845817565918, "step": 469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 1774.96435546875, "completions/mean_terminated_length": 1697.6505126953125, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 0.7320872274143302, "grad_norm": 0.5296007394790649, "kl": 0.04803318716585636, "learning_rate": 1.713125e-06, "loss": 0.0229, "num_tokens": 60712831.0, "reward": 1.4119234085083008, "reward_std": 0.0688636526465416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.411923348903656, "rewards/correct_reward_func/std": 0.15681524574756622, "step": 470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1522.75, "completions/mean_terminated_length": 1522.75, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.7336448598130841, "grad_norm": 0.5558991432189941, "kl": 0.051254723221063614, "learning_rate": 1.7125e-06, "loss": 0.0205, "num_tokens": 60846634.0, "reward": 1.4586231708526611, "reward_std": 0.08559418469667435, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4705279767513275, "rewards/correct_reward_func/std": 0.13128410279750824, "step": 471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 1663.6785888671875, "completions/mean_terminated_length": 1663.6785888671875, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.735202492211838, "grad_norm": 0.5688772797584534, "kl": 0.0507583636790514, "learning_rate": 1.711875e-06, "loss": 0.0262, "num_tokens": 60992275.0, "reward": 1.5023914575576782, "reward_std": 0.1096097081899643, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5142960548400879, "rewards/correct_reward_func/std": 0.14677509665489197, "step": 472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 1556.7738037109375, "completions/mean_terminated_length": 1556.7738037109375, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.7367601246105919, "grad_norm": 0.6016846895217896, "kl": 0.05065236613154411, "learning_rate": 1.71125e-06, "loss": 0.0569, "num_tokens": 61128960.0, "reward": 1.4518747329711914, "reward_std": 0.06996078789234161, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46377936005592346, "rewards/correct_reward_func/std": 0.1257256120443344, "step": 473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 1696.1429443359375, "completions/mean_terminated_length": 1696.1429443359375, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.7383177570093458, "grad_norm": 0.5974622964859009, "kl": 0.05056299455463886, "learning_rate": 1.710625e-06, "loss": -0.0015, "num_tokens": 61277664.0, "reward": 1.4762911796569824, "reward_std": 0.04775853455066681, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47629112005233765, "rewards/correct_reward_func/std": 0.1305771917104721, "step": 474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 1597.011962890625, "completions/mean_terminated_length": 1597.011962890625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.7398753894080997, "grad_norm": 0.5798767805099487, "kl": 0.05241680145263672, "learning_rate": 1.71e-06, "loss": 0.001, "num_tokens": 61417813.0, "reward": 1.4604065418243408, "reward_std": 0.07921247184276581, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4604065418243408, "rewards/correct_reward_func/std": 0.12199635803699493, "step": 475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1607.8929443359375, "completions/mean_terminated_length": 1607.8929443359375, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.7414330218068536, "grad_norm": 0.5608850717544556, "kl": 0.05262966826558113, "learning_rate": 1.709375e-06, "loss": 0.0272, "num_tokens": 61558840.0, "reward": 1.429826021194458, "reward_std": 0.07498659938573837, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4417307674884796, "rewards/correct_reward_func/std": 0.16307246685028076, "step": 476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 1651.3929443359375, "completions/mean_terminated_length": 1651.3929443359375, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 0.7429906542056075, "grad_norm": 0.5357276201248169, "kl": 0.05072159692645073, "learning_rate": 1.70875e-06, "loss": 0.0321, "num_tokens": 61703707.0, "reward": 1.4157724380493164, "reward_std": 0.10736193507909775, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43958187103271484, "rewards/correct_reward_func/std": 0.12011130154132843, "step": 477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 1622.2261962890625, "completions/mean_terminated_length": 1543.072265625, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.7445482866043613, "grad_norm": 0.5624713897705078, "kl": 0.050551433116197586, "learning_rate": 1.7081249999999998e-06, "loss": 0.0817, "num_tokens": 61845956.0, "reward": 1.4873374700546265, "reward_std": 0.1054694801568985, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4992421567440033, "rewards/correct_reward_func/std": 0.1856101006269455, "step": 478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7608.0, "completions/max_terminated_length": 7608.0, "completions/mean_length": 1654.2738037109375, "completions/mean_terminated_length": 1654.2738037109375, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "epoch": 0.7461059190031153, "grad_norm": 0.5814685821533203, "kl": 0.05036089010536671, "learning_rate": 1.7075e-06, "loss": 0.0464, "num_tokens": 61991041.0, "reward": 1.451461911201477, "reward_std": 0.09037837386131287, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4633665978908539, "rewards/correct_reward_func/std": 0.12058395892381668, "step": 479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 1494.3214111328125, "completions/mean_terminated_length": 1494.3214111328125, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 0.7476635514018691, "grad_norm": 0.5886862277984619, "kl": 0.050980525091290474, "learning_rate": 1.7068749999999999e-06, "loss": 0.0151, "num_tokens": 62122528.0, "reward": 1.461233139038086, "reward_std": 0.07253991812467575, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47313785552978516, "rewards/correct_reward_func/std": 0.14707376062870026, "step": 480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 1512.6309814453125, "completions/mean_terminated_length": 1512.6309814453125, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.7492211838006231, "grad_norm": 0.5945307016372681, "kl": 0.0515163391828537, "learning_rate": 1.70625e-06, "loss": 0.0025, "num_tokens": 62255565.0, "reward": 1.5655839443206787, "reward_std": 0.07332275062799454, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5655838251113892, "rewards/correct_reward_func/std": 0.18415912985801697, "step": 481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1462.0357666015625, "completions/mean_terminated_length": 1462.0357666015625, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 0.7507788161993769, "grad_norm": 0.575648307800293, "kl": 0.052306439727544785, "learning_rate": 1.7056249999999999e-06, "loss": -0.0139, "num_tokens": 62384370.0, "reward": 1.5397838354110718, "reward_std": 0.0683126300573349, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.539783775806427, "rewards/correct_reward_func/std": 0.17842523753643036, "step": 482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 1485.84521484375, "completions/mean_terminated_length": 1485.84521484375, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.7523364485981309, "grad_norm": 0.5642300248146057, "kl": 0.05181148275732994, "learning_rate": 1.705e-06, "loss": 0.0319, "num_tokens": 62515067.0, "reward": 1.4557358026504517, "reward_std": 0.11825248599052429, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4795452356338501, "rewards/correct_reward_func/std": 0.16276678442955017, "step": 483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1477.65478515625, "completions/mean_terminated_length": 1477.65478515625, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 0.7538940809968847, "grad_norm": 0.5979113578796387, "kl": 0.05194063484668732, "learning_rate": 1.7043749999999999e-06, "loss": 0.0126, "num_tokens": 62645214.0, "reward": 1.4856928586959839, "reward_std": 0.0904906839132309, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4975975453853607, "rewards/correct_reward_func/std": 0.13653963804244995, "step": 484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 1486.6309814453125, "completions/mean_terminated_length": 1486.6309814453125, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 0.7554517133956387, "grad_norm": 0.6199777722358704, "kl": 0.05140496790409088, "learning_rate": 1.70375e-06, "loss": 0.0157, "num_tokens": 62776049.0, "reward": 1.491317629814148, "reward_std": 0.06931986659765244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4913175702095032, "rewards/correct_reward_func/std": 0.14081616699695587, "step": 485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1389.3095703125, "completions/mean_terminated_length": 1389.3095703125, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.7570093457943925, "grad_norm": 0.5803106427192688, "kl": 0.05373929440975189, "learning_rate": 1.7031249999999999e-06, "loss": -0.0063, "num_tokens": 62898547.0, "reward": 1.4282513856887817, "reward_std": 0.06992341578006744, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4282512664794922, "rewards/correct_reward_func/std": 0.14699019491672516, "step": 486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1418.5357666015625, "completions/mean_terminated_length": 1418.5357666015625, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 0.7585669781931464, "grad_norm": 0.620182454586029, "kl": 0.0546103548258543, "learning_rate": 1.7024999999999998e-06, "loss": -0.0024, "num_tokens": 63023800.0, "reward": 1.4738986492156982, "reward_std": 0.11807496100664139, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4858033359050751, "rewards/correct_reward_func/std": 0.16928018629550934, "step": 487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 1467.21435546875, "completions/mean_terminated_length": 1467.21435546875, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.7601246105919003, "grad_norm": 0.5910825729370117, "kl": 0.0529879629611969, "learning_rate": 1.701875e-06, "loss": 0.0043, "num_tokens": 63153034.0, "reward": 1.4873260259628296, "reward_std": 0.05371030792593956, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4873259365558624, "rewards/correct_reward_func/std": 0.11501560360193253, "step": 488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1416.6429443359375, "completions/mean_terminated_length": 1416.6429443359375, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.7616822429906542, "grad_norm": 0.617638885974884, "kl": 0.05328808352351189, "learning_rate": 1.7012499999999998e-06, "loss": 0.0089, "num_tokens": 63278134.0, "reward": 1.4918674230575562, "reward_std": 0.05592425912618637, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49186742305755615, "rewards/correct_reward_func/std": 0.11877977102994919, "step": 489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1438.6309814453125, "completions/mean_terminated_length": 1438.6309814453125, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.7632398753894081, "grad_norm": 0.5881965160369873, "kl": 0.0523222591727972, "learning_rate": 1.700625e-06, "loss": -0.0096, "num_tokens": 63405033.0, "reward": 1.4045344591140747, "reward_std": 0.07134377211332321, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.41643914580345154, "rewards/correct_reward_func/std": 0.1567194163799286, "step": 490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1401.0714111328125, "completions/mean_terminated_length": 1401.0714111328125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.764797507788162, "grad_norm": 0.6001453995704651, "kl": 0.05563800781965256, "learning_rate": 1.6999999999999998e-06, "loss": -0.0112, "num_tokens": 63528669.0, "reward": 1.4765801429748535, "reward_std": 0.0643647164106369, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47658008337020874, "rewards/correct_reward_func/std": 0.16985422372817993, "step": 491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1454.0357666015625, "completions/mean_terminated_length": 1372.8553466796875, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 0.7663551401869159, "grad_norm": 0.6344577670097351, "kl": 0.05234198831021786, "learning_rate": 1.699375e-06, "loss": 0.0894, "num_tokens": 63656826.0, "reward": 1.5414061546325684, "reward_std": 0.08393041044473648, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5533110499382019, "rewards/correct_reward_func/std": 0.12277739495038986, "step": 492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1342.84521484375, "completions/mean_terminated_length": 1342.84521484375, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.7679127725856698, "grad_norm": 0.6125787496566772, "kl": 0.05459017679095268, "learning_rate": 1.6987499999999998e-06, "loss": -0.0266, "num_tokens": 63775661.0, "reward": 1.463148832321167, "reward_std": 0.0493975505232811, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46314874291419983, "rewards/correct_reward_func/std": 0.12720361351966858, "step": 493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1410.2261962890625, "completions/mean_terminated_length": 1410.2261962890625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.7694704049844237, "grad_norm": 0.625629186630249, "kl": 0.05353173241019249, "learning_rate": 1.698125e-06, "loss": 0.0163, "num_tokens": 63900228.0, "reward": 1.5133934020996094, "reward_std": 0.10190200060606003, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5252981781959534, "rewards/correct_reward_func/std": 0.13644695281982422, "step": 494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1312.297607421875, "completions/mean_terminated_length": 1312.297607421875, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.7710280373831776, "grad_norm": 0.6696212887763977, "kl": 0.05529572255909443, "learning_rate": 1.6974999999999998e-06, "loss": -0.0171, "num_tokens": 64016269.0, "reward": 1.464273452758789, "reward_std": 0.07633471488952637, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4642733931541443, "rewards/correct_reward_func/std": 0.1434505134820938, "step": 495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 1418.7738037109375, "completions/mean_terminated_length": 1418.7738037109375, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.7725856697819314, "grad_norm": 0.6065709590911865, "kl": 0.05418024770915508, "learning_rate": 1.6968749999999997e-06, "loss": -0.03, "num_tokens": 64141350.0, "reward": 1.5015569925308228, "reward_std": 0.0749620795249939, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5015567541122437, "rewards/correct_reward_func/std": 0.1574825942516327, "step": 496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1359.4761962890625, "completions/mean_terminated_length": 1359.4761962890625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.7741433021806854, "grad_norm": 0.6393604278564453, "kl": 0.053761230781674385, "learning_rate": 1.6962499999999999e-06, "loss": -0.0042, "num_tokens": 64261606.0, "reward": 1.5147299766540527, "reward_std": 0.08370784670114517, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5266348719596863, "rewards/correct_reward_func/std": 0.12140747904777527, "step": 497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1397.642822265625, "completions/mean_terminated_length": 1397.642822265625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.7757009345794392, "grad_norm": 0.5982325673103333, "kl": 0.05507444404065609, "learning_rate": 1.695625e-06, "loss": 0.0236, "num_tokens": 64385140.0, "reward": 1.4505058526992798, "reward_std": 0.06045551598072052, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4505058526992798, "rewards/correct_reward_func/std": 0.16863283514976501, "step": 498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 1413.607177734375, "completions/mean_terminated_length": 1413.607177734375, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.7772585669781932, "grad_norm": 0.6396889686584473, "kl": 0.0517488569021225, "learning_rate": 1.695e-06, "loss": 0.0036, "num_tokens": 64510087.0, "reward": 1.5013822317123413, "reward_std": 0.05566215515136719, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5013821721076965, "rewards/correct_reward_func/std": 0.1984904706478119, "step": 499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1331.75, "completions/mean_terminated_length": 1331.75, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.778816199376947, "grad_norm": 0.6182188391685486, "kl": 0.05334976129233837, "learning_rate": 1.694375e-06, "loss": 0.0177, "num_tokens": 64627834.0, "reward": 1.4577831029891968, "reward_std": 0.07913817465305328, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46968796849250793, "rewards/correct_reward_func/std": 0.1922857016324997, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1450.797607421875, "completions/mean_terminated_length": 1450.797607421875, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.780373831775701, "grad_norm": 0.6251698136329651, "kl": 0.051690295338630676, "learning_rate": 1.69375e-06, "loss": 0.0195, "num_tokens": 64755809.0, "reward": 1.4815540313720703, "reward_std": 0.06203337013721466, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48155394196510315, "rewards/correct_reward_func/std": 0.14881980419158936, "step": 501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1361.0, "completions/mean_terminated_length": 1361.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.7819314641744548, "grad_norm": 0.6154322624206543, "kl": 0.05483602173626423, "learning_rate": 1.693125e-06, "loss": -0.0267, "num_tokens": 64876139.0, "reward": 1.434553861618042, "reward_std": 0.1056680679321289, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44645848870277405, "rewards/correct_reward_func/std": 0.15556204319000244, "step": 502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 1336.75, "completions/mean_terminated_length": 1336.75, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.7834890965732088, "grad_norm": 0.6085004210472107, "kl": 0.05451551079750061, "learning_rate": 1.6924999999999999e-06, "loss": 0.027, "num_tokens": 64994438.0, "reward": 1.5156744718551636, "reward_std": 0.043764952570199966, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5156744718551636, "rewards/correct_reward_func/std": 0.10536623001098633, "step": 503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2154.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 1349.65478515625, "completions/mean_terminated_length": 1349.65478515625, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.7850467289719626, "grad_norm": 0.6269933581352234, "kl": 0.05425166338682175, "learning_rate": 1.691875e-06, "loss": 0.005, "num_tokens": 65113491.0, "reward": 1.4136245250701904, "reward_std": 0.07355698943138123, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41362443566322327, "rewards/correct_reward_func/std": 0.11681222915649414, "step": 504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6143.0, "completions/max_terminated_length": 6143.0, "completions/mean_length": 1480.0357666015625, "completions/mean_terminated_length": 1480.0357666015625, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.7866043613707165, "grad_norm": 0.58085036277771, "kl": 0.05101562291383743, "learning_rate": 1.69125e-06, "loss": 0.0136, "num_tokens": 65243952.0, "reward": 1.4428439140319824, "reward_std": 0.10061752051115036, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45474860072135925, "rewards/correct_reward_func/std": 0.15178316831588745, "step": 505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 1478.357177734375, "completions/mean_terminated_length": 1397.4698486328125, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.7881619937694704, "grad_norm": 0.614782989025116, "kl": 0.05181491747498512, "learning_rate": 1.690625e-06, "loss": 0.0927, "num_tokens": 65374230.0, "reward": 1.5134276151657104, "reward_std": 0.10299229621887207, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5253323912620544, "rewards/correct_reward_func/std": 0.13395552337169647, "step": 506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1314.7381591796875, "completions/mean_terminated_length": 1314.7381591796875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.7897196261682243, "grad_norm": 0.6423413753509521, "kl": 0.052083175629377365, "learning_rate": 1.69e-06, "loss": -0.0164, "num_tokens": 65490740.0, "reward": 1.4805521965026855, "reward_std": 0.07352635264396667, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4924568831920624, "rewards/correct_reward_func/std": 0.14243850111961365, "step": 507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 1539.2738037109375, "completions/mean_terminated_length": 1459.1204833984375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.7912772585669782, "grad_norm": 0.6012693643569946, "kl": 0.052487269043922424, "learning_rate": 1.689375e-06, "loss": 0.0457, "num_tokens": 65626057.0, "reward": 1.493817687034607, "reward_std": 0.06720510125160217, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49381768703460693, "rewards/correct_reward_func/std": 0.141060933470726, "step": 508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1402.7261962890625, "completions/mean_terminated_length": 1320.9276123046875, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.7928348909657321, "grad_norm": 0.610801100730896, "kl": 0.053645048290491104, "learning_rate": 1.68875e-06, "loss": 0.0464, "num_tokens": 65749730.0, "reward": 1.4257547855377197, "reward_std": 0.12775346636772156, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.44956421852111816, "rewards/correct_reward_func/std": 0.128119558095932, "step": 509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1397.3095703125, "completions/mean_terminated_length": 1397.3095703125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.794392523364486, "grad_norm": 0.6136677265167236, "kl": 0.05390959791839123, "learning_rate": 1.688125e-06, "loss": 0.01, "num_tokens": 65873218.0, "reward": 1.447425127029419, "reward_std": 0.07631354033946991, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4593297839164734, "rewards/correct_reward_func/std": 0.13860400021076202, "step": 510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1334.452392578125, "completions/mean_terminated_length": 1334.452392578125, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 0.7959501557632399, "grad_norm": 0.6296293139457703, "kl": 0.05470990762114525, "learning_rate": 1.6875e-06, "loss": 0.018, "num_tokens": 65991102.0, "reward": 1.4921993017196655, "reward_std": 0.09120924770832062, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5041038990020752, "rewards/correct_reward_func/std": 0.20131434500217438, "step": 511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 1491.107177734375, "completions/mean_terminated_length": 1491.107177734375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 0.7975077881619937, "grad_norm": 0.6053344011306763, "kl": 0.054690854623913765, "learning_rate": 1.6868749999999998e-06, "loss": 0.0082, "num_tokens": 66122301.0, "reward": 1.5253815650939941, "reward_std": 0.0556306354701519, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5253814458847046, "rewards/correct_reward_func/std": 0.19547365605831146, "step": 512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1394.857177734375, "completions/mean_terminated_length": 1394.857177734375, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.7990654205607477, "grad_norm": 0.6043696403503418, "kl": 0.05226844176650047, "learning_rate": 1.68625e-06, "loss": 0.0163, "num_tokens": 66245343.0, "reward": 1.5012750625610352, "reward_std": 0.07726840674877167, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5012750625610352, "rewards/correct_reward_func/std": 0.17448803782463074, "step": 513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1384.1785888671875, "completions/mean_terminated_length": 1384.1785888671875, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.8006230529595015, "grad_norm": 0.5974183678627014, "kl": 0.055330896750092506, "learning_rate": 1.6856249999999998e-06, "loss": -0.0029, "num_tokens": 66367602.0, "reward": 1.39756441116333, "reward_std": 0.10132217407226562, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.42137381434440613, "rewards/correct_reward_func/std": 0.16763533651828766, "step": 514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 1507.5833740234375, "completions/mean_terminated_length": 1507.5833740234375, "completions/min_length": 941.0, "completions/min_terminated_length": 941.0, "epoch": 0.8021806853582555, "grad_norm": 0.5618709325790405, "kl": 0.05384498089551926, "learning_rate": 1.685e-06, "loss": 0.0189, "num_tokens": 66500467.0, "reward": 1.504148006439209, "reward_std": 0.057246141135692596, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5041479468345642, "rewards/correct_reward_func/std": 0.1694769561290741, "step": 515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3970.0, "completions/max_terminated_length": 3970.0, "completions/mean_length": 1491.3095703125, "completions/mean_terminated_length": 1491.3095703125, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.8037383177570093, "grad_norm": 0.6067160964012146, "kl": 0.05185644514858723, "learning_rate": 1.6843749999999999e-06, "loss": 0.0155, "num_tokens": 66631719.0, "reward": 1.4592036008834839, "reward_std": 0.0799928829073906, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4711082875728607, "rewards/correct_reward_func/std": 0.12382801622152328, "step": 516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1441.6905517578125, "completions/mean_terminated_length": 1441.6905517578125, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.8052959501557633, "grad_norm": 0.5906907916069031, "kl": 0.05282064713537693, "learning_rate": 1.68375e-06, "loss": 0.0161, "num_tokens": 66758611.0, "reward": 1.4480403661727905, "reward_std": 0.07139705866575241, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4599449932575226, "rewards/correct_reward_func/std": 0.18760444223880768, "step": 517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1492.5357666015625, "completions/mean_terminated_length": 1492.5357666015625, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.8068535825545171, "grad_norm": 0.5900249481201172, "kl": 0.05465748719871044, "learning_rate": 1.6831249999999999e-06, "loss": 0.0236, "num_tokens": 66889984.0, "reward": 1.473251223564148, "reward_std": 0.05754239857196808, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4732511639595032, "rewards/correct_reward_func/std": 0.16302239894866943, "step": 518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1433.2381591796875, "completions/mean_terminated_length": 1433.2381591796875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.8084112149532711, "grad_norm": 0.6327292323112488, "kl": 0.05179595574736595, "learning_rate": 1.6825e-06, "loss": -0.0127, "num_tokens": 67016334.0, "reward": 1.517919898033142, "reward_std": 0.10886523127555847, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5417292714118958, "rewards/correct_reward_func/std": 0.18844658136367798, "step": 519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 1455.261962890625, "completions/mean_terminated_length": 1455.261962890625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.8099688473520249, "grad_norm": 0.5824243426322937, "kl": 0.05402742512524128, "learning_rate": 1.6818749999999999e-06, "loss": 0.0003, "num_tokens": 67144684.0, "reward": 1.4728202819824219, "reward_std": 0.07268865406513214, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48472505807876587, "rewards/correct_reward_func/std": 0.1330195963382721, "step": 520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1356.7738037109375, "completions/mean_terminated_length": 1356.7738037109375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.8115264797507789, "grad_norm": 0.635526716709137, "kl": 0.052402498200535774, "learning_rate": 1.6812499999999998e-06, "loss": -0.0018, "num_tokens": 67264455.0, "reward": 1.4374048709869385, "reward_std": 0.10275428742170334, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44930967688560486, "rewards/correct_reward_func/std": 0.12510575354099274, "step": 521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1410.011962890625, "completions/mean_terminated_length": 1410.011962890625, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 0.8130841121495327, "grad_norm": 0.6527436971664429, "kl": 0.061864860355854034, "learning_rate": 1.680625e-06, "loss": -0.007, "num_tokens": 67388722.0, "reward": 1.4815458059310913, "reward_std": 0.06229028478264809, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.481545627117157, "rewards/correct_reward_func/std": 0.18914058804512024, "step": 522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 1409.8809814453125, "completions/mean_terminated_length": 1409.8809814453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8146417445482866, "grad_norm": 0.5648652911186218, "kl": 0.05158809758722782, "learning_rate": 1.6799999999999998e-06, "loss": -0.0055, "num_tokens": 67513224.0, "reward": 1.4905917644500732, "reward_std": 0.0897277295589447, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5024964213371277, "rewards/correct_reward_func/std": 0.1823735535144806, "step": 523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2069.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 1410.6190185546875, "completions/mean_terminated_length": 1410.6190185546875, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.8161993769470405, "grad_norm": 0.601411759853363, "kl": 0.0548630990087986, "learning_rate": 1.679375e-06, "loss": 0.0227, "num_tokens": 67637728.0, "reward": 1.4363572597503662, "reward_std": 0.08451084047555923, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44826188683509827, "rewards/correct_reward_func/std": 0.17532704770565033, "step": 524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1413.547607421875, "completions/mean_terminated_length": 1413.547607421875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.8177570093457944, "grad_norm": 0.619880735874176, "kl": 0.0535897146910429, "learning_rate": 1.6787499999999998e-06, "loss": 0.0125, "num_tokens": 67762508.0, "reward": 1.4487403631210327, "reward_std": 0.0894000232219696, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46064507961273193, "rewards/correct_reward_func/std": 0.14411649107933044, "step": 525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1396.892822265625, "completions/mean_terminated_length": 1396.892822265625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8193146417445483, "grad_norm": 0.5552582144737244, "kl": 0.052618470042943954, "learning_rate": 1.678125e-06, "loss": 0.0085, "num_tokens": 67885985.0, "reward": 1.513651967048645, "reward_std": 0.07783416658639908, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5255565643310547, "rewards/correct_reward_func/std": 0.18098370730876923, "step": 526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 1407.4285888671875, "completions/mean_terminated_length": 1407.4285888671875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.8208722741433022, "grad_norm": 0.5891075730323792, "kl": 0.05191943235695362, "learning_rate": 1.6774999999999998e-06, "loss": -0.0255, "num_tokens": 68010101.0, "reward": 1.531822919845581, "reward_std": 0.0646144449710846, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5318229794502258, "rewards/correct_reward_func/std": 0.13647998869419098, "step": 527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1330.71435546875, "completions/mean_terminated_length": 1330.71435546875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.822429906542056, "grad_norm": 0.6207326054573059, "kl": 0.05244195647537708, "learning_rate": 1.6768749999999997e-06, "loss": -0.0223, "num_tokens": 68127731.0, "reward": 1.5092251300811768, "reward_std": 0.05685145780444145, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5092251300811768, "rewards/correct_reward_func/std": 0.1656326800584793, "step": 528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1391.7381591796875, "completions/mean_terminated_length": 1391.7381591796875, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.82398753894081, "grad_norm": 0.5846036076545715, "kl": 0.05157465487718582, "learning_rate": 1.67625e-06, "loss": 0.0272, "num_tokens": 68250595.0, "reward": 1.480483889579773, "reward_std": 0.06157321855425835, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4804837703704834, "rewards/correct_reward_func/std": 0.12125560641288757, "step": 529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1558.21435546875, "completions/mean_terminated_length": 1478.2890625, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.8255451713395638, "grad_norm": 0.5714659094810486, "kl": 0.04993342235684395, "learning_rate": 1.675625e-06, "loss": 0.0461, "num_tokens": 68387617.0, "reward": 1.4617350101470947, "reward_std": 0.062348198145627975, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4617350101470947, "rewards/correct_reward_func/std": 0.1364985555410385, "step": 530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1433.452392578125, "completions/mean_terminated_length": 1433.452392578125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.8271028037383178, "grad_norm": 0.5750571489334106, "kl": 0.05183848738670349, "learning_rate": 1.675e-06, "loss": -0.012, "num_tokens": 68513937.0, "reward": 1.5087705850601196, "reward_std": 0.07611233741044998, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5087705254554749, "rewards/correct_reward_func/std": 0.17911018431186676, "step": 531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1391.5357666015625, "completions/mean_terminated_length": 1391.5357666015625, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.8286604361370716, "grad_norm": 0.6533128023147583, "kl": 0.05436134710907936, "learning_rate": 1.674375e-06, "loss": 0.0264, "num_tokens": 68636850.0, "reward": 1.51654851436615, "reward_std": 0.09789982438087463, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5284532904624939, "rewards/correct_reward_func/std": 0.15198057889938354, "step": 532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 1433.857177734375, "completions/mean_terminated_length": 1433.857177734375, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.8302180685358256, "grad_norm": 0.5955455303192139, "kl": 0.055306799709796906, "learning_rate": 1.67375e-06, "loss": -0.0119, "num_tokens": 68763192.0, "reward": 1.379233717918396, "reward_std": 0.04899342358112335, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.37923356890678406, "rewards/correct_reward_func/std": 0.1353388875722885, "step": 533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 1472.8095703125, "completions/mean_terminated_length": 1472.8095703125, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.8317757009345794, "grad_norm": 0.5779551267623901, "kl": 0.05130494572222233, "learning_rate": 1.673125e-06, "loss": 0.0082, "num_tokens": 68893040.0, "reward": 1.4051659107208252, "reward_std": 0.049869608134031296, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4051658511161804, "rewards/correct_reward_func/std": 0.14183004200458527, "step": 534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1413.2857666015625, "completions/mean_terminated_length": 1413.2857666015625, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.8333333333333334, "grad_norm": 0.6209075450897217, "kl": 0.05292078107595444, "learning_rate": 1.6725e-06, "loss": -0.0059, "num_tokens": 69017690.0, "reward": 1.5440659523010254, "reward_std": 0.06486238539218903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5440659523010254, "rewards/correct_reward_func/std": 0.1593974381685257, "step": 535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1482.3809814453125, "completions/mean_terminated_length": 1482.3809814453125, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 0.8348909657320872, "grad_norm": 0.6003454923629761, "kl": 0.05392787978053093, "learning_rate": 1.671875e-06, "loss": 0.0123, "num_tokens": 69148084.0, "reward": 1.436340093612671, "reward_std": 0.07149424403905869, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4363400638103485, "rewards/correct_reward_func/std": 0.14438651502132416, "step": 536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1409.46435546875, "completions/mean_terminated_length": 1409.46435546875, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.8364485981308412, "grad_norm": 0.6742749214172363, "kl": 0.053148942068219185, "learning_rate": 1.6712499999999999e-06, "loss": -0.0079, "num_tokens": 69272203.0, "reward": 1.4142444133758545, "reward_std": 0.10058359056711197, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43805375695228577, "rewards/correct_reward_func/std": 0.17819122970104218, "step": 537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 1534.0833740234375, "completions/mean_terminated_length": 1453.867431640625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.838006230529595, "grad_norm": 0.572339653968811, "kl": 0.051896609365940094, "learning_rate": 1.670625e-06, "loss": 0.0649, "num_tokens": 69406928.0, "reward": 1.4281702041625977, "reward_std": 0.08568203449249268, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44007474184036255, "rewards/correct_reward_func/std": 0.1391279697418213, "step": 538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3857.0, "completions/max_terminated_length": 3857.0, "completions/mean_length": 1528.0, "completions/mean_terminated_length": 1528.0, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.839563862928349, "grad_norm": 0.558670699596405, "kl": 0.05065500736236572, "learning_rate": 1.6699999999999999e-06, "loss": 0.0049, "num_tokens": 69541424.0, "reward": 1.416330337524414, "reward_std": 0.12241604179143906, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4401398301124573, "rewards/correct_reward_func/std": 0.12226840853691101, "step": 539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1439.107177734375, "completions/mean_terminated_length": 1439.107177734375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.8411214953271028, "grad_norm": 0.6253755688667297, "kl": 0.05376381799578667, "learning_rate": 1.669375e-06, "loss": 0.0018, "num_tokens": 69668123.0, "reward": 1.4674383401870728, "reward_std": 0.0582718625664711, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4674382507801056, "rewards/correct_reward_func/std": 0.17001482844352722, "step": 540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1451.797607421875, "completions/mean_terminated_length": 1451.797607421875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.8426791277258567, "grad_norm": 0.5441909432411194, "kl": 0.05154600366950035, "learning_rate": 1.66875e-06, "loss": -0.0146, "num_tokens": 69795798.0, "reward": 1.5086621046066284, "reward_std": 0.045781608670949936, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5086619853973389, "rewards/correct_reward_func/std": 0.15602315962314606, "step": 541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1410.3809814453125, "completions/mean_terminated_length": 1410.3809814453125, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.8442367601246106, "grad_norm": 0.5914663076400757, "kl": 0.053898200392723083, "learning_rate": 1.668125e-06, "loss": -0.02, "num_tokens": 69920132.0, "reward": 1.4829809665679932, "reward_std": 0.05561475455760956, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4829808175563812, "rewards/correct_reward_func/std": 0.14528672397136688, "step": 542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1559.607177734375, "completions/mean_terminated_length": 1559.607177734375, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.8457943925233645, "grad_norm": 0.5803366303443909, "kl": 0.05089765228331089, "learning_rate": 1.6675e-06, "loss": 0.024, "num_tokens": 70057187.0, "reward": 1.5257433652877808, "reward_std": 0.07609397917985916, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.525743305683136, "rewards/correct_reward_func/std": 0.15661990642547607, "step": 543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2234.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 1478.202392578125, "completions/mean_terminated_length": 1478.202392578125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.8473520249221184, "grad_norm": 0.631567120552063, "kl": 0.051157766953110695, "learning_rate": 1.666875e-06, "loss": -0.0001, "num_tokens": 70187470.0, "reward": 1.4295824766159058, "reward_std": 0.06356283277273178, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42958250641822815, "rewards/correct_reward_func/std": 0.164540097117424, "step": 544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 1515.0238037109375, "completions/mean_terminated_length": 1515.0238037109375, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.8489096573208723, "grad_norm": 0.5900546908378601, "kl": 0.05033543519675732, "learning_rate": 1.66625e-06, "loss": -0.0113, "num_tokens": 70320696.0, "reward": 1.4821819067001343, "reward_std": 0.0709303766489029, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4940865933895111, "rewards/correct_reward_func/std": 0.1473175436258316, "step": 545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1536.761962890625, "completions/mean_terminated_length": 1536.761962890625, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 0.8504672897196262, "grad_norm": 0.5679816007614136, "kl": 0.05010136775672436, "learning_rate": 1.6656249999999998e-06, "loss": -0.0173, "num_tokens": 70455964.0, "reward": 1.5630290508270264, "reward_std": 0.06558456271886826, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5630288124084473, "rewards/correct_reward_func/std": 0.16733138263225555, "step": 546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1501.6429443359375, "completions/mean_terminated_length": 1501.6429443359375, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 0.8520249221183801, "grad_norm": 0.5807083249092102, "kl": 0.05040537752211094, "learning_rate": 1.665e-06, "loss": 0.0092, "num_tokens": 70588168.0, "reward": 1.4890462160110474, "reward_std": 0.050529684871435165, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.489046186208725, "rewards/correct_reward_func/std": 0.11799110472202301, "step": 547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 1504.90478515625, "completions/mean_terminated_length": 1504.90478515625, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.8535825545171339, "grad_norm": 0.5985187292098999, "kl": 0.0527173038572073, "learning_rate": 1.6643749999999998e-06, "loss": 0.0376, "num_tokens": 70720526.0, "reward": 1.4671262502670288, "reward_std": 0.09527470171451569, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4790307879447937, "rewards/correct_reward_func/std": 0.2005491405725479, "step": 548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2135.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 1473.0, "completions/mean_terminated_length": 1473.0, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.8551401869158879, "grad_norm": 0.6216426491737366, "kl": 0.0518038310110569, "learning_rate": 1.66375e-06, "loss": 0.002, "num_tokens": 70850270.0, "reward": 1.4148383140563965, "reward_std": 0.13262715935707092, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4386478662490845, "rewards/correct_reward_func/std": 0.1450170874595642, "step": 549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2297.0, "completions/mean_length": 1617.7738037109375, "completions/mean_terminated_length": 1538.566162109375, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 0.8566978193146417, "grad_norm": 0.5692617893218994, "kl": 0.04915030300617218, "learning_rate": 1.6631249999999999e-06, "loss": 0.0763, "num_tokens": 70992001.0, "reward": 1.4859832525253296, "reward_std": 0.08638235181570053, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49788784980773926, "rewards/correct_reward_func/std": 0.1480027735233307, "step": 550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1563.7261962890625, "completions/mean_terminated_length": 1483.867431640625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 0.8582554517133957, "grad_norm": 0.5355828404426575, "kl": 0.04826325178146362, "learning_rate": 1.6625e-06, "loss": 0.0394, "num_tokens": 71129288.0, "reward": 1.4876452684402466, "reward_std": 0.05155961960554123, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4876452386379242, "rewards/correct_reward_func/std": 0.17587290704250336, "step": 551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 1465.4761962890625, "completions/mean_terminated_length": 1465.4761962890625, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.8598130841121495, "grad_norm": 0.6027195453643799, "kl": 0.051084551960229874, "learning_rate": 1.6618749999999999e-06, "loss": -0.0266, "num_tokens": 71258208.0, "reward": 1.48594069480896, "reward_std": 0.06709278374910355, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4859406650066376, "rewards/correct_reward_func/std": 0.16938088834285736, "step": 552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4262.0, "completions/max_terminated_length": 4262.0, "completions/mean_length": 1517.666748046875, "completions/mean_terminated_length": 1517.666748046875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.8613707165109035, "grad_norm": 0.580878734588623, "kl": 0.04972629249095917, "learning_rate": 1.6612499999999998e-06, "loss": -0.0144, "num_tokens": 71391566.0, "reward": 1.4727210998535156, "reward_std": 0.06501049548387527, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4727211892604828, "rewards/correct_reward_func/std": 0.17857278883457184, "step": 553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1515.7857666015625, "completions/mean_terminated_length": 1515.7857666015625, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.8629283489096573, "grad_norm": 0.599119246006012, "kl": 0.050645509734749794, "learning_rate": 1.6606249999999999e-06, "loss": -0.0003, "num_tokens": 71524916.0, "reward": 1.4626071453094482, "reward_std": 0.053476471453905106, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46260714530944824, "rewards/correct_reward_func/std": 0.16813401877880096, "step": 554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5913.0, "completions/max_terminated_length": 5913.0, "completions/mean_length": 1586.047607421875, "completions/mean_terminated_length": 1586.047607421875, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.8644859813084113, "grad_norm": 0.547502338886261, "kl": 0.04839299060404301, "learning_rate": 1.6599999999999998e-06, "loss": 0.0048, "num_tokens": 71664126.0, "reward": 1.568305253982544, "reward_std": 0.08509069681167603, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5802100300788879, "rewards/correct_reward_func/std": 0.1785019338130951, "step": 555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1738.5238037109375, "completions/mean_terminated_length": 1581.1219482421875, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 0.8660436137071651, "grad_norm": 0.5003688931465149, "kl": 0.047175006940960884, "learning_rate": 1.6593749999999999e-06, "loss": 0.072, "num_tokens": 71816114.0, "reward": 1.487537145614624, "reward_std": 0.078678660094738, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4875370264053345, "rewards/correct_reward_func/std": 0.17857803404331207, "step": 556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 1579.0595703125, "completions/mean_terminated_length": 1579.0595703125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.867601246105919, "grad_norm": 0.583847165107727, "kl": 0.04886885918676853, "learning_rate": 1.6587499999999998e-06, "loss": 0.0038, "num_tokens": 71954809.0, "reward": 1.4350553750991821, "reward_std": 0.06607770174741745, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43505528569221497, "rewards/correct_reward_func/std": 0.10001393407583237, "step": 557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 1577.0, "completions/mean_terminated_length": 1577.0, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 0.8691588785046729, "grad_norm": 0.5692570805549622, "kl": 0.05144515633583069, "learning_rate": 1.658125e-06, "loss": -0.0243, "num_tokens": 72093283.0, "reward": 1.4471040964126587, "reward_std": 0.06054630130529404, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44710394740104675, "rewards/correct_reward_func/std": 0.1185460090637207, "step": 558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1629.9285888671875, "completions/mean_terminated_length": 1550.867431640625, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.8707165109034268, "grad_norm": 0.5377056002616882, "kl": 0.049677252769470215, "learning_rate": 1.6574999999999998e-06, "loss": 0.0604, "num_tokens": 72236281.0, "reward": 1.4650782346725464, "reward_std": 0.08549048751592636, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4650781750679016, "rewards/correct_reward_func/std": 0.13779646158218384, "step": 559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 1595.09521484375, "completions/mean_terminated_length": 1595.09521484375, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 0.8722741433021807, "grad_norm": 0.5444644689559937, "kl": 0.053564492613077164, "learning_rate": 1.6568750000000001e-06, "loss": -0.0056, "num_tokens": 72376257.0, "reward": 1.464841604232788, "reward_std": 0.07330876588821411, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4648415148258209, "rewards/correct_reward_func/std": 0.13325951993465424, "step": 560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1595.761962890625, "completions/mean_terminated_length": 1595.761962890625, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "epoch": 0.8738317757009346, "grad_norm": 0.5756959915161133, "kl": 0.0488431490957737, "learning_rate": 1.65625e-06, "loss": -0.0016, "num_tokens": 72516427.0, "reward": 1.5044291019439697, "reward_std": 0.05856965854763985, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5044289827346802, "rewards/correct_reward_func/std": 0.16118833422660828, "step": 561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 1523.202392578125, "completions/mean_terminated_length": 1523.202392578125, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.8753894080996885, "grad_norm": 0.6184421181678772, "kl": 0.051321882754564285, "learning_rate": 1.655625e-06, "loss": 0.0021, "num_tokens": 72650334.0, "reward": 1.439370036125183, "reward_std": 0.12014901638031006, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4512746334075928, "rewards/correct_reward_func/std": 0.13923780620098114, "step": 562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 1580.0595703125, "completions/mean_terminated_length": 1580.0595703125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "epoch": 0.8769470404984424, "grad_norm": 0.5475049018859863, "kl": 0.05060616135597229, "learning_rate": 1.655e-06, "loss": 0.0094, "num_tokens": 72789155.0, "reward": 1.4986872673034668, "reward_std": 0.041443560272455215, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4986870288848877, "rewards/correct_reward_func/std": 0.1334153264760971, "step": 563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 1658.9881591796875, "completions/mean_terminated_length": 1580.277099609375, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.8785046728971962, "grad_norm": 0.5355792045593262, "kl": 0.0508806686848402, "learning_rate": 1.654375e-06, "loss": -0.0215, "num_tokens": 72934552.0, "reward": 1.425096035003662, "reward_std": 0.0804082602262497, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4370007812976837, "rewards/correct_reward_func/std": 0.12526416778564453, "step": 564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1626.2857666015625, "completions/mean_terminated_length": 1626.2857666015625, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 0.8800623052959502, "grad_norm": 0.5639548897743225, "kl": 0.05000521242618561, "learning_rate": 1.65375e-06, "loss": 0.0054, "num_tokens": 73077196.0, "reward": 1.4919753074645996, "reward_std": 0.06127806007862091, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49197524785995483, "rewards/correct_reward_func/std": 0.1505451202392578, "step": 565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1541.8690185546875, "completions/mean_terminated_length": 1541.8690185546875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.881619937694704, "grad_norm": 0.5418336987495422, "kl": 0.04926094599068165, "learning_rate": 1.653125e-06, "loss": -0.0078, "num_tokens": 73212797.0, "reward": 1.5520154237747192, "reward_std": 0.06705118715763092, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5520154237747192, "rewards/correct_reward_func/std": 0.22710636258125305, "step": 566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1580.0357666015625, "completions/mean_terminated_length": 1580.0357666015625, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 0.883177570093458, "grad_norm": 0.5915489196777344, "kl": 0.05472877249121666, "learning_rate": 1.6525e-06, "loss": -0.0068, "num_tokens": 73351538.0, "reward": 1.450407862663269, "reward_std": 0.10079541802406311, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47421735525131226, "rewards/correct_reward_func/std": 0.12341609597206116, "step": 567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 1513.6905517578125, "completions/mean_terminated_length": 1513.6905517578125, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.8847352024922118, "grad_norm": 0.600501537322998, "kl": 0.05078642629086971, "learning_rate": 1.651875e-06, "loss": -0.0143, "num_tokens": 73484532.0, "reward": 1.485834002494812, "reward_std": 0.07049893587827682, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4858340322971344, "rewards/correct_reward_func/std": 0.16757294535636902, "step": 568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3088.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1561.09521484375, "completions/mean_terminated_length": 1561.09521484375, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 0.8862928348909658, "grad_norm": 0.641735851764679, "kl": 0.052700335159897804, "learning_rate": 1.65125e-06, "loss": -0.0118, "num_tokens": 73621460.0, "reward": 1.4019627571105957, "reward_std": 0.089509978890419, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4138675034046173, "rewards/correct_reward_func/std": 0.20820264518260956, "step": 569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 1680.702392578125, "completions/mean_terminated_length": 1680.702392578125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "epoch": 0.8878504672897196, "grad_norm": 0.5752266049385071, "kl": 0.05081222578883171, "learning_rate": 1.650625e-06, "loss": 0.001, "num_tokens": 73768867.0, "reward": 1.428143858909607, "reward_std": 0.08753962814807892, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.440048485994339, "rewards/correct_reward_func/std": 0.14757801592350006, "step": 570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1525.011962890625, "completions/mean_terminated_length": 1525.011962890625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.8894080996884736, "grad_norm": 0.5790801644325256, "kl": 0.050931330770254135, "learning_rate": 1.6499999999999999e-06, "loss": -0.0014, "num_tokens": 73902932.0, "reward": 1.5280438661575317, "reward_std": 0.07709922641515732, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5399484634399414, "rewards/correct_reward_func/std": 0.19332382082939148, "step": 571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 1548.2857666015625, "completions/mean_terminated_length": 1548.2857666015625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.8909657320872274, "grad_norm": 0.5646366477012634, "kl": 0.05197379179298878, "learning_rate": 1.649375e-06, "loss": 0.0077, "num_tokens": 74038952.0, "reward": 1.399301290512085, "reward_std": 0.0882752537727356, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4112059772014618, "rewards/correct_reward_func/std": 0.13882336020469666, "step": 572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1620.15478515625, "completions/mean_terminated_length": 1620.15478515625, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 0.8925233644859814, "grad_norm": 0.5650038719177246, "kl": 0.0501062236726284, "learning_rate": 1.6487499999999999e-06, "loss": 0.0136, "num_tokens": 74180907.0, "reward": 1.4877279996871948, "reward_std": 0.06202785298228264, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48772794008255005, "rewards/correct_reward_func/std": 0.1389545202255249, "step": 573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1473.297607421875, "completions/mean_terminated_length": 1473.297607421875, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.8940809968847352, "grad_norm": 0.5704150199890137, "kl": 0.05233505181968212, "learning_rate": 1.648125e-06, "loss": -0.0144, "num_tokens": 74310478.0, "reward": 1.524600625038147, "reward_std": 0.10687962174415588, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.536505401134491, "rewards/correct_reward_func/std": 0.17535418272018433, "step": 574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 1529.8095703125, "completions/mean_terminated_length": 1529.8095703125, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 0.8956386292834891, "grad_norm": 0.583591878414154, "kl": 0.05231664888560772, "learning_rate": 1.6475e-06, "loss": -0.0175, "num_tokens": 74444784.0, "reward": 1.4119700193405151, "reward_std": 0.08519253879785538, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42387470602989197, "rewards/correct_reward_func/std": 0.1410830169916153, "step": 575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 1685.96435546875, "completions/mean_terminated_length": 1607.5782470703125, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.897196261682243, "grad_norm": 0.5724698901176453, "kl": 0.05051821656525135, "learning_rate": 1.646875e-06, "loss": 0.0665, "num_tokens": 74592453.0, "reward": 1.543835997581482, "reward_std": 0.059640318155288696, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5438359379768372, "rewards/correct_reward_func/std": 0.19233205914497375, "step": 576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1596.0, "completions/mean_terminated_length": 1596.0, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "epoch": 0.8987538940809969, "grad_norm": 0.5430770516395569, "kl": 0.05068780109286308, "learning_rate": 1.64625e-06, "loss": 0.003, "num_tokens": 74732313.0, "reward": 1.4892923831939697, "reward_std": 0.045956652611494064, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4892924129962921, "rewards/correct_reward_func/std": 0.13337074220180511, "step": 577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 1619.3333740234375, "completions/mean_terminated_length": 1619.3333740234375, "completions/min_length": 1108.0, "completions/min_terminated_length": 1108.0, "epoch": 0.9003115264797508, "grad_norm": 0.5669228434562683, "kl": 0.05153697915375233, "learning_rate": 1.6456249999999998e-06, "loss": -0.0006, "num_tokens": 74874247.0, "reward": 1.4417579174041748, "reward_std": 0.08493451774120331, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.453662633895874, "rewards/correct_reward_func/std": 0.14506934583187103, "step": 578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 1587.84521484375, "completions/mean_terminated_length": 1587.84521484375, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "epoch": 0.9018691588785047, "grad_norm": 0.5580478310585022, "kl": 0.0523674376308918, "learning_rate": 1.645e-06, "loss": 0.018, "num_tokens": 75013500.0, "reward": 1.4614784717559814, "reward_std": 0.06435896456241608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46147841215133667, "rewards/correct_reward_func/std": 0.11859949678182602, "step": 579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3307.0, "completions/max_terminated_length": 3307.0, "completions/mean_length": 1637.8214111328125, "completions/mean_terminated_length": 1637.8214111328125, "completions/min_length": 1092.0, "completions/min_terminated_length": 1092.0, "epoch": 0.9034267912772586, "grad_norm": 0.540749192237854, "kl": 0.05217336490750313, "learning_rate": 1.6443749999999998e-06, "loss": 0.005, "num_tokens": 75157203.0, "reward": 1.4611413478851318, "reward_std": 0.10950693488121033, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4849506914615631, "rewards/correct_reward_func/std": 0.16065186262130737, "step": 580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 1549.2261962890625, "completions/mean_terminated_length": 1549.2261962890625, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.9049844236760125, "grad_norm": 0.6054486632347107, "kl": 0.05162344500422478, "learning_rate": 1.64375e-06, "loss": 0.0224, "num_tokens": 75293266.0, "reward": 1.4400663375854492, "reward_std": 0.048954516649246216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4400663375854492, "rewards/correct_reward_func/std": 0.1594221442937851, "step": 581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1517.6785888671875, "completions/mean_terminated_length": 1517.6785888671875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.9065420560747663, "grad_norm": 0.5864580273628235, "kl": 0.052071839570999146, "learning_rate": 1.6431249999999998e-06, "loss": 0.0076, "num_tokens": 75426745.0, "reward": 1.4933863878250122, "reward_std": 0.054050467908382416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4933864176273346, "rewards/correct_reward_func/std": 0.17375320196151733, "step": 582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1574.7857666015625, "completions/mean_terminated_length": 1574.7857666015625, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.9080996884735203, "grad_norm": 0.5605589747428894, "kl": 0.05352449230849743, "learning_rate": 1.6425e-06, "loss": -0.0052, "num_tokens": 75565075.0, "reward": 1.488409399986267, "reward_std": 0.11526400595903397, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5122188329696655, "rewards/correct_reward_func/std": 0.14258237183094025, "step": 583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 1574.011962890625, "completions/mean_terminated_length": 1574.011962890625, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 0.9096573208722741, "grad_norm": 0.5379892587661743, "kl": 0.05358175188302994, "learning_rate": 1.6418749999999998e-06, "loss": -0.0107, "num_tokens": 75703280.0, "reward": 1.4819163084030151, "reward_std": 0.11179199814796448, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5057256817817688, "rewards/correct_reward_func/std": 0.16328661143779755, "step": 584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 1526.857177734375, "completions/mean_terminated_length": 1526.857177734375, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 0.9112149532710281, "grad_norm": 0.5592092871665955, "kl": 0.052205150946974754, "learning_rate": 1.64125e-06, "loss": -0.0203, "num_tokens": 75837530.0, "reward": 1.4071049690246582, "reward_std": 0.15914778411388397, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.454723984003067, "rewards/correct_reward_func/std": 0.16848695278167725, "step": 585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1569.4881591796875, "completions/mean_terminated_length": 1569.4881591796875, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.9127725856697819, "grad_norm": 0.5686935782432556, "kl": 0.05123456381261349, "learning_rate": 1.6406249999999999e-06, "loss": 0.0122, "num_tokens": 75975445.0, "reward": 1.4490768909454346, "reward_std": 0.06693350523710251, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4490768313407898, "rewards/correct_reward_func/std": 0.16664773225784302, "step": 586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2338.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 1549.047607421875, "completions/mean_terminated_length": 1549.047607421875, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 0.9143302180685359, "grad_norm": 0.566740095615387, "kl": 0.05220544897019863, "learning_rate": 1.6399999999999998e-06, "loss": -0.0109, "num_tokens": 76111565.0, "reward": 1.464043378829956, "reward_std": 0.12711408734321594, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4878527820110321, "rewards/correct_reward_func/std": 0.16010543704032898, "step": 587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1619.8214111328125, "completions/mean_terminated_length": 1540.6385498046875, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.9158878504672897, "grad_norm": 0.5762239694595337, "kl": 0.052646003663539886, "learning_rate": 1.6393749999999999e-06, "loss": 0.0575, "num_tokens": 76253672.0, "reward": 1.441072702407837, "reward_std": 0.0739678293466568, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44107261300086975, "rewards/correct_reward_func/std": 0.12623170018196106, "step": 588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 1510.1309814453125, "completions/mean_terminated_length": 1510.1309814453125, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.9174454828660437, "grad_norm": 0.5868603587150574, "kl": 0.05117998085916042, "learning_rate": 1.6387499999999998e-06, "loss": -0.0027, "num_tokens": 76386439.0, "reward": 1.5747073888778687, "reward_std": 0.06914177536964417, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5747074484825134, "rewards/correct_reward_func/std": 0.14264759421348572, "step": 589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1601.357177734375, "completions/mean_terminated_length": 1601.357177734375, "completions/min_length": 1061.0, "completions/min_terminated_length": 1061.0, "epoch": 0.9190031152647975, "grad_norm": 0.5775259137153625, "kl": 0.05480557680130005, "learning_rate": 1.6381249999999999e-06, "loss": 0.008, "num_tokens": 76527043.0, "reward": 1.4700545072555542, "reward_std": 0.08241315186023712, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4819592535495758, "rewards/correct_reward_func/std": 0.14215821027755737, "step": 590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 1482.797607421875, "completions/mean_terminated_length": 1482.797607421875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.9205607476635514, "grad_norm": 0.599651575088501, "kl": 0.05351861007511616, "learning_rate": 1.6374999999999998e-06, "loss": -0.0187, "num_tokens": 76657436.0, "reward": 1.5297892093658447, "reward_std": 0.06223803758621216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5297890901565552, "rewards/correct_reward_func/std": 0.18300145864486694, "step": 591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 1613.2738037109375, "completions/mean_terminated_length": 1613.2738037109375, "completions/min_length": 1017.0, "completions/min_terminated_length": 1017.0, "epoch": 0.9221183800623053, "grad_norm": 0.5727816224098206, "kl": 0.0537562221288681, "learning_rate": 1.636875e-06, "loss": 0.012, "num_tokens": 76798939.0, "reward": 1.5009434223175049, "reward_std": 0.08991846442222595, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5247528553009033, "rewards/correct_reward_func/std": 0.18158107995986938, "step": 592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 1659.5714111328125, "completions/mean_terminated_length": 1580.867431640625, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "epoch": 0.9236760124610592, "grad_norm": 0.5455081462860107, "kl": 0.05132809653878212, "learning_rate": 1.63625e-06, "loss": 0.0699, "num_tokens": 76944613.0, "reward": 1.4970442056655884, "reward_std": 0.09464308619499207, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5089487433433533, "rewards/correct_reward_func/std": 0.16302646696567535, "step": 593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1536.5238037109375, "completions/mean_terminated_length": 1536.5238037109375, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.9252336448598131, "grad_norm": 0.5868439078330994, "kl": 0.05363127030432224, "learning_rate": 1.6356250000000001e-06, "loss": -0.0331, "num_tokens": 77079573.0, "reward": 1.4771671295166016, "reward_std": 0.05991039052605629, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4771670699119568, "rewards/correct_reward_func/std": 0.1585981547832489, "step": 594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 1595.8095703125, "completions/mean_terminated_length": 1595.8095703125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.926791277258567, "grad_norm": 0.5364544987678528, "kl": 0.055180374532938004, "learning_rate": 1.635e-06, "loss": 0.0001, "num_tokens": 77219441.0, "reward": 1.4917380809783936, "reward_std": 0.09097646921873093, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4917379915714264, "rewards/correct_reward_func/std": 0.16991373896598816, "step": 595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 1606.0833740234375, "completions/mean_terminated_length": 1606.0833740234375, "completions/min_length": 1030.0, "completions/min_terminated_length": 1030.0, "epoch": 0.9283489096573209, "grad_norm": 0.5445214509963989, "kl": 0.05271473526954651, "learning_rate": 1.634375e-06, "loss": -0.023, "num_tokens": 77360292.0, "reward": 1.4440690279006958, "reward_std": 0.048418521881103516, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44406890869140625, "rewards/correct_reward_func/std": 0.16575324535369873, "step": 596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1495.71435546875, "completions/mean_terminated_length": 1495.71435546875, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "epoch": 0.9299065420560748, "grad_norm": 0.5676112771034241, "kl": 0.05709120258688927, "learning_rate": 1.63375e-06, "loss": -0.0011, "num_tokens": 77491650.0, "reward": 1.5036193132400513, "reward_std": 0.07075376063585281, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5036192536354065, "rewards/correct_reward_func/std": 0.15129926800727844, "step": 597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 1607.1785888671875, "completions/mean_terminated_length": 1607.1785888671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9314641744548287, "grad_norm": 0.5542066097259521, "kl": 0.05464941821992397, "learning_rate": 1.633125e-06, "loss": 0.0103, "num_tokens": 77632623.0, "reward": 1.4994677305221558, "reward_std": 0.09316570311784744, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.511372447013855, "rewards/correct_reward_func/std": 0.10848263651132584, "step": 598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 1543.1309814453125, "completions/mean_terminated_length": 1543.1309814453125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.9330218068535826, "grad_norm": 0.5920741558074951, "kl": 0.05645397678017616, "learning_rate": 1.6325e-06, "loss": 0.0179, "num_tokens": 77768348.0, "reward": 1.4137533903121948, "reward_std": 0.130188450217247, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.44946759939193726, "rewards/correct_reward_func/std": 0.1394752562046051, "step": 599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1543.8214111328125, "completions/mean_terminated_length": 1543.8214111328125, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "epoch": 0.9345794392523364, "grad_norm": 0.629948079586029, "kl": 0.05816573277115822, "learning_rate": 1.631875e-06, "loss": 0.008, "num_tokens": 77903795.0, "reward": 1.4406551122665405, "reward_std": 0.08957747370004654, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44065502285957336, "rewards/correct_reward_func/std": 0.16707439720630646, "step": 600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 1753.5714111328125, "completions/mean_terminated_length": 1675.9998779296875, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "epoch": 0.9361370716510904, "grad_norm": 0.596502959728241, "kl": 0.05688577890396118, "learning_rate": 1.63125e-06, "loss": 0.0596, "num_tokens": 78057203.0, "reward": 1.4966117143630981, "reward_std": 0.08040610700845718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4966115951538086, "rewards/correct_reward_func/std": 0.22688382863998413, "step": 601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3153.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 1590.25, "completions/mean_terminated_length": 1590.25, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 0.9376947040498442, "grad_norm": 0.5286832451820374, "kl": 0.05895489826798439, "learning_rate": 1.630625e-06, "loss": -0.0303, "num_tokens": 78196856.0, "reward": 1.4874906539916992, "reward_std": 0.044962868094444275, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48749059438705444, "rewards/correct_reward_func/std": 0.1546032726764679, "step": 602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2431.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 1600.21435546875, "completions/mean_terminated_length": 1600.21435546875, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.9392523364485982, "grad_norm": 0.5734366178512573, "kl": 0.05501212365925312, "learning_rate": 1.6299999999999999e-06, "loss": 0.0089, "num_tokens": 78337274.0, "reward": 1.5072236061096191, "reward_std": 0.05490497127175331, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5072234869003296, "rewards/correct_reward_func/std": 0.11040540784597397, "step": 603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1633.90478515625, "completions/mean_terminated_length": 1633.90478515625, "completions/min_length": 1092.0, "completions/min_terminated_length": 1092.0, "epoch": 0.940809968847352, "grad_norm": 0.5251239538192749, "kl": 0.059018656611442566, "learning_rate": 1.629375e-06, "loss": -0.0014, "num_tokens": 78480546.0, "reward": 1.4861642122268677, "reward_std": 0.06594221293926239, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4980688989162445, "rewards/correct_reward_func/std": 0.16121798753738403, "step": 604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1652.1309814453125, "completions/mean_terminated_length": 1652.1309814453125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.942367601246106, "grad_norm": 0.5935388803482056, "kl": 0.051552364602684975, "learning_rate": 1.6287499999999999e-06, "loss": 0.0244, "num_tokens": 78625445.0, "reward": 1.4352984428405762, "reward_std": 0.13022708892822266, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.47101256251335144, "rewards/correct_reward_func/std": 0.1250392645597458, "step": 605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 1707.4405517578125, "completions/mean_terminated_length": 1629.313232421875, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "epoch": 0.9439252336448598, "grad_norm": 0.6000062227249146, "kl": 0.054089561104774475, "learning_rate": 1.628125e-06, "loss": 0.0647, "num_tokens": 78774852.0, "reward": 1.4837253093719482, "reward_std": 0.07214730232954025, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4837252199649811, "rewards/correct_reward_func/std": 0.1362723410129547, "step": 606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 1669.357177734375, "completions/mean_terminated_length": 1590.77099609375, "completions/min_length": 1059.0, "completions/min_terminated_length": 1059.0, "epoch": 0.9454828660436138, "grad_norm": 0.5420730113983154, "kl": 0.051561569795012474, "learning_rate": 1.6274999999999999e-06, "loss": 0.0154, "num_tokens": 78921096.0, "reward": 1.472241759300232, "reward_std": 0.10751719772815704, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4841463565826416, "rewards/correct_reward_func/std": 0.1544354408979416, "step": 607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 1506.25, "completions/mean_terminated_length": 1506.25, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.9470404984423676, "grad_norm": 0.6013126373291016, "kl": 0.05470665171742439, "learning_rate": 1.626875e-06, "loss": 0.0038, "num_tokens": 79053429.0, "reward": 1.5111743211746216, "reward_std": 0.06690473109483719, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5111743211746216, "rewards/correct_reward_func/std": 0.18495941162109375, "step": 608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1583.166748046875, "completions/mean_terminated_length": 1503.5421142578125, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 0.9485981308411215, "grad_norm": 0.5386534929275513, "kl": 0.051652608439326286, "learning_rate": 1.6262499999999999e-06, "loss": 0.056, "num_tokens": 79192355.0, "reward": 1.4510307312011719, "reward_std": 0.0934084877371788, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4629353880882263, "rewards/correct_reward_func/std": 0.14852337539196014, "step": 609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 1621.202392578125, "completions/mean_terminated_length": 1621.202392578125, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.9501557632398754, "grad_norm": 0.6040880680084229, "kl": 0.053494108840823174, "learning_rate": 1.625625e-06, "loss": -0.0353, "num_tokens": 79334620.0, "reward": 1.4715975522994995, "reward_std": 0.0786285549402237, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4715975821018219, "rewards/correct_reward_func/std": 0.11298926919698715, "step": 610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 1519.4285888671875, "completions/mean_terminated_length": 1519.4285888671875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.9517133956386293, "grad_norm": 0.5844832062721252, "kl": 0.0530572235584259, "learning_rate": 1.625e-06, "loss": 0.0093, "num_tokens": 79468030.0, "reward": 1.5320613384246826, "reward_std": 0.05157333239912987, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5320611596107483, "rewards/correct_reward_func/std": 0.14916525781154633, "step": 611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1606.5, "completions/mean_terminated_length": 1606.5, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.9532710280373832, "grad_norm": 0.5951868295669556, "kl": 0.05439838580787182, "learning_rate": 1.6243749999999998e-06, "loss": -0.0238, "num_tokens": 79608940.0, "reward": 1.4870160818099976, "reward_std": 0.1107514277100563, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4989207088947296, "rewards/correct_reward_func/std": 0.1661691665649414, "step": 612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 1537.761962890625, "completions/mean_terminated_length": 1537.761962890625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.9548286604361371, "grad_norm": 0.5835762023925781, "kl": 0.054533904418349266, "learning_rate": 1.62375e-06, "loss": 0.0028, "num_tokens": 79744142.0, "reward": 1.4982998371124268, "reward_std": 0.08925885707139969, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5102044939994812, "rewards/correct_reward_func/std": 0.21719758212566376, "step": 613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1523.2261962890625, "completions/mean_terminated_length": 1523.2261962890625, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.956386292834891, "grad_norm": 0.5852372646331787, "kl": 0.05277659185230732, "learning_rate": 1.6231249999999998e-06, "loss": -0.0142, "num_tokens": 79878015.0, "reward": 1.458331823348999, "reward_std": 0.0705060064792633, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4583317041397095, "rewards/correct_reward_func/std": 0.1471869796514511, "step": 614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1595.261962890625, "completions/mean_terminated_length": 1595.261962890625, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.9579439252336449, "grad_norm": 0.5480629801750183, "kl": 0.0494478065520525, "learning_rate": 1.6225e-06, "loss": -0.0132, "num_tokens": 80017915.0, "reward": 1.4836761951446533, "reward_std": 0.06662869453430176, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4836761951446533, "rewards/correct_reward_func/std": 0.1571483314037323, "step": 615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 1565.5238037109375, "completions/mean_terminated_length": 1565.5238037109375, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 0.9595015576323987, "grad_norm": 0.5563946962356567, "kl": 0.05082782916724682, "learning_rate": 1.6218749999999998e-06, "loss": 0.0008, "num_tokens": 80155491.0, "reward": 1.5200374126434326, "reward_std": 0.0669645443558693, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5200372934341431, "rewards/correct_reward_func/std": 0.18864794075489044, "step": 616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1500.7381591796875, "completions/mean_terminated_length": 1500.7381591796875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.9610591900311527, "grad_norm": 0.5845383405685425, "kl": 0.051540493965148926, "learning_rate": 1.62125e-06, "loss": -0.0132, "num_tokens": 80287547.0, "reward": 1.5222618579864502, "reward_std": 0.08891423046588898, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5341665148735046, "rewards/correct_reward_func/std": 0.1643657386302948, "step": 617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1515.0357666015625, "completions/mean_terminated_length": 1515.0357666015625, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.9626168224299065, "grad_norm": 0.6092395782470703, "kl": 0.05296482518315315, "learning_rate": 1.6206249999999998e-06, "loss": 0.0086, "num_tokens": 80420648.0, "reward": 1.4399313926696777, "reward_std": 0.08079881966114044, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43993115425109863, "rewards/correct_reward_func/std": 0.13497735559940338, "step": 618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 1513.666748046875, "completions/mean_terminated_length": 1513.666748046875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.9641744548286605, "grad_norm": 0.5746808052062988, "kl": 0.05054004117846489, "learning_rate": 1.62e-06, "loss": -0.0325, "num_tokens": 80553754.0, "reward": 1.478222370147705, "reward_std": 0.05955832451581955, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4782223105430603, "rewards/correct_reward_func/std": 0.13707558810710907, "step": 619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3365.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 1535.46435546875, "completions/mean_terminated_length": 1535.46435546875, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.9657320872274143, "grad_norm": 0.6188405752182007, "kl": 0.05118212662637234, "learning_rate": 1.6193749999999998e-06, "loss": 0.068, "num_tokens": 80688679.0, "reward": 1.4972518682479858, "reward_std": 0.10318266600370407, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.521061360836029, "rewards/correct_reward_func/std": 0.15406495332717896, "step": 620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1599.0357666015625, "completions/mean_terminated_length": 1519.602294921875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.9672897196261683, "grad_norm": 0.5300514698028564, "kl": 0.04921773634850979, "learning_rate": 1.6187499999999997e-06, "loss": 0.0581, "num_tokens": 80829022.0, "reward": 1.4415113925933838, "reward_std": 0.08664444088935852, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44151124358177185, "rewards/correct_reward_func/std": 0.12956391274929047, "step": 621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 1577.96435546875, "completions/mean_terminated_length": 1498.277099609375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.9688473520249221, "grad_norm": 0.585616409778595, "kl": 0.0514204315841198, "learning_rate": 1.6181249999999999e-06, "loss": 0.0696, "num_tokens": 80967613.0, "reward": 1.4526182413101196, "reward_std": 0.07754890620708466, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45261818170547485, "rewards/correct_reward_func/std": 0.14640595018863678, "step": 622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1603.0714111328125, "completions/mean_terminated_length": 1523.6866455078125, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.9704049844236761, "grad_norm": 0.6005199551582336, "kl": 0.05098399519920349, "learning_rate": 1.6174999999999998e-06, "loss": 0.0495, "num_tokens": 81108259.0, "reward": 1.4635608196258545, "reward_std": 0.09107384085655212, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4754655957221985, "rewards/correct_reward_func/std": 0.12346034497022629, "step": 623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1519.261962890625, "completions/mean_terminated_length": 1519.261962890625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.9719626168224299, "grad_norm": 0.6062273383140564, "kl": 0.049668088555336, "learning_rate": 1.616875e-06, "loss": 0.031, "num_tokens": 81241739.0, "reward": 1.4660530090332031, "reward_std": 0.062135469168424606, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4660530090332031, "rewards/correct_reward_func/std": 0.12143100053071976, "step": 624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2461.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 1491.5714111328125, "completions/mean_terminated_length": 1491.5714111328125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.9735202492211839, "grad_norm": 0.6340486407279968, "kl": 0.05301540531218052, "learning_rate": 1.61625e-06, "loss": -0.0211, "num_tokens": 81372875.0, "reward": 1.487614393234253, "reward_std": 0.0698120966553688, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48761430382728577, "rewards/correct_reward_func/std": 0.13601452112197876, "step": 625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1590.8929443359375, "completions/mean_terminated_length": 1590.8929443359375, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.9750778816199377, "grad_norm": 0.5444204807281494, "kl": 0.0505395382642746, "learning_rate": 1.615625e-06, "loss": 0.0167, "num_tokens": 81512804.0, "reward": 1.4724253416061401, "reward_std": 0.05580959469079971, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47242528200149536, "rewards/correct_reward_func/std": 0.12042959779500961, "step": 626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 1554.75, "completions/mean_terminated_length": 1554.75, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.9766355140186916, "grad_norm": 0.5865889191627502, "kl": 0.05015707015991211, "learning_rate": 1.615e-06, "loss": 0.004, "num_tokens": 81649517.0, "reward": 1.4877718687057495, "reward_std": 0.08064839243888855, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4996766448020935, "rewards/correct_reward_func/std": 0.1834038645029068, "step": 627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1443.4761962890625, "completions/mean_terminated_length": 1443.4761962890625, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.9781931464174455, "grad_norm": 0.5513234734535217, "kl": 0.054139742627739906, "learning_rate": 1.614375e-06, "loss": 0.006, "num_tokens": 81776583.0, "reward": 1.5102035999298096, "reward_std": 0.05765219032764435, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5102035999298096, "rewards/correct_reward_func/std": 0.1545766443014145, "step": 628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 1546.21435546875, "completions/mean_terminated_length": 1546.21435546875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.9797507788161994, "grad_norm": 0.592833936214447, "kl": 0.05099848657846451, "learning_rate": 1.61375e-06, "loss": 0.0172, "num_tokens": 81912435.0, "reward": 1.4125304222106934, "reward_std": 0.08528114855289459, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.424435019493103, "rewards/correct_reward_func/std": 0.11915619671344757, "step": 629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1624.5238037109375, "completions/mean_terminated_length": 1545.3975830078125, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 0.9813084112149533, "grad_norm": 0.5546726584434509, "kl": 0.05285438522696495, "learning_rate": 1.613125e-06, "loss": 0.0583, "num_tokens": 82054865.0, "reward": 1.5317888259887695, "reward_std": 0.12638387084007263, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5436934232711792, "rewards/correct_reward_func/std": 0.1404784917831421, "step": 630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3120.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 1527.8095703125, "completions/mean_terminated_length": 1527.8095703125, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 0.9828660436137072, "grad_norm": 0.5793138146400452, "kl": 0.052825529128313065, "learning_rate": 1.6125e-06, "loss": 0.0209, "num_tokens": 82189249.0, "reward": 1.470069408416748, "reward_std": 0.07235594838857651, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4700692892074585, "rewards/correct_reward_func/std": 0.13610175251960754, "step": 631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 1610.011962890625, "completions/mean_terminated_length": 1530.7108154296875, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.9844236760124611, "grad_norm": 0.5624516606330872, "kl": 0.0496527124196291, "learning_rate": 1.611875e-06, "loss": 0.0426, "num_tokens": 82330616.0, "reward": 1.531969428062439, "reward_std": 0.10485806316137314, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.543874204158783, "rewards/correct_reward_func/std": 0.16279828548431396, "step": 632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2136.0, "completions/max_terminated_length": 2136.0, "completions/mean_length": 1469.1190185546875, "completions/mean_terminated_length": 1469.1190185546875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.985981308411215, "grad_norm": 0.5987145304679871, "kl": 0.052102504298090935, "learning_rate": 1.61125e-06, "loss": 0.019, "num_tokens": 82459950.0, "reward": 1.5328919887542725, "reward_std": 0.08720895648002625, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5447967648506165, "rewards/correct_reward_func/std": 0.17077518999576569, "step": 633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1462.15478515625, "completions/mean_terminated_length": 1462.15478515625, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 0.9875389408099688, "grad_norm": 0.5946549773216248, "kl": 0.05218057334423065, "learning_rate": 1.610625e-06, "loss": -0.0021, "num_tokens": 82588483.0, "reward": 1.4602668285369873, "reward_std": 0.06267713755369186, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46026673913002014, "rewards/correct_reward_func/std": 0.11811360716819763, "step": 634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 1475.15478515625, "completions/mean_terminated_length": 1475.15478515625, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.9890965732087228, "grad_norm": 0.6137917041778564, "kl": 0.05290712043642998, "learning_rate": 1.61e-06, "loss": 0.0123, "num_tokens": 82718324.0, "reward": 1.4867254495620728, "reward_std": 0.060699447989463806, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.486725389957428, "rewards/correct_reward_func/std": 0.15401968359947205, "step": 635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3418.0, "completions/mean_length": 1539.71435546875, "completions/mean_terminated_length": 1459.566162109375, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.9906542056074766, "grad_norm": 0.5406956672668457, "kl": 0.05111967213451862, "learning_rate": 1.609375e-06, "loss": 0.0556, "num_tokens": 82853534.0, "reward": 1.5321414470672607, "reward_std": 0.08517434448003769, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.532141387462616, "rewards/correct_reward_func/std": 0.1582731455564499, "step": 636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 1444.59521484375, "completions/mean_terminated_length": 1444.59521484375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.9922118380062306, "grad_norm": 0.5991398096084595, "kl": 0.04915725626051426, "learning_rate": 1.6087499999999998e-06, "loss": 0.0246, "num_tokens": 82980928.0, "reward": 1.5238535404205322, "reward_std": 0.04892566055059433, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5238535404205322, "rewards/correct_reward_func/std": 0.12992213666439056, "step": 637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1405.952392578125, "completions/mean_terminated_length": 1405.952392578125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.9937694704049844, "grad_norm": 0.6216338872909546, "kl": 0.052867574617266655, "learning_rate": 1.608125e-06, "loss": 0.0101, "num_tokens": 83104848.0, "reward": 1.5133877992630005, "reward_std": 0.07994896173477173, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5133876204490662, "rewards/correct_reward_func/std": 0.1903848499059677, "step": 638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 1475.5833740234375, "completions/mean_terminated_length": 1475.5833740234375, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.9953271028037384, "grad_norm": 0.5925660133361816, "kl": 0.05206291750073433, "learning_rate": 1.6074999999999999e-06, "loss": 0.0255, "num_tokens": 83234737.0, "reward": 1.497523307800293, "reward_std": 0.0626191571354866, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4975232183933258, "rewards/correct_reward_func/std": 0.18619437515735626, "step": 639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5409.0, "completions/max_terminated_length": 5409.0, "completions/mean_length": 1527.107177734375, "completions/mean_terminated_length": 1527.107177734375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9968847352024922, "grad_norm": 0.6247422695159912, "kl": 0.04974444583058357, "learning_rate": 1.606875e-06, "loss": -0.0138, "num_tokens": 83369086.0, "reward": 1.4803240299224854, "reward_std": 0.07588109374046326, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4922286570072174, "rewards/correct_reward_func/std": 0.15059354901313782, "step": 640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1502.4761962890625, "completions/mean_terminated_length": 1502.4761962890625, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.9984423676012462, "grad_norm": 0.5777775645256042, "kl": 0.050743360072374344, "learning_rate": 1.6062499999999999e-06, "loss": -0.0207, "num_tokens": 83501378.0, "reward": 1.5227001905441284, "reward_std": 0.05805504322052002, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5227001309394836, "rewards/correct_reward_func/std": 0.12714529037475586, "step": 641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2487.0, "completions/max_terminated_length": 2487.0, "completions/mean_length": 1441.857177734375, "completions/mean_terminated_length": 1441.857177734375, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 1.0, "grad_norm": 0.6082573533058167, "kl": 0.05472211726009846, "learning_rate": 1.605625e-06, "loss": 0.0021, "num_tokens": 83628314.0, "reward": 1.4580137729644775, "reward_std": 0.11369025707244873, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4818231463432312, "rewards/correct_reward_func/std": 0.1592247188091278, "step": 642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 1665.71435546875, "completions/mean_terminated_length": 1587.084228515625, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "epoch": 1.0015576323987538, "grad_norm": 0.5756648778915405, "kl": 0.05213580280542374, "learning_rate": 1.6049999999999999e-06, "loss": 0.0548, "num_tokens": 83774564.0, "reward": 1.4815843105316162, "reward_std": 0.07832953333854675, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48158419132232666, "rewards/correct_reward_func/std": 0.12412244081497192, "step": 643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1476.797607421875, "completions/mean_terminated_length": 1476.797607421875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 1.0031152647975077, "grad_norm": 0.594108521938324, "kl": 0.05231509543955326, "learning_rate": 1.604375e-06, "loss": 0.0001, "num_tokens": 83904741.0, "reward": 1.5084398984909058, "reward_std": 0.08493813127279282, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5203444957733154, "rewards/correct_reward_func/std": 0.11909134685993195, "step": 644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1510.357177734375, "completions/mean_terminated_length": 1510.357177734375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.0046728971962617, "grad_norm": 0.6033718585968018, "kl": 0.05159814655780792, "learning_rate": 1.6037499999999999e-06, "loss": -0.0043, "num_tokens": 84037497.0, "reward": 1.5596506595611572, "reward_std": 0.06282084435224533, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5596506595611572, "rewards/correct_reward_func/std": 0.18580923974514008, "step": 645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 1567.84521484375, "completions/mean_terminated_length": 1567.84521484375, "completions/min_length": 1075.0, "completions/min_terminated_length": 1075.0, "epoch": 1.0062305295950156, "grad_norm": 0.5789066553115845, "kl": 0.052623504772782326, "learning_rate": 1.6031249999999998e-06, "loss": 0.0338, "num_tokens": 84175148.0, "reward": 1.5069093704223633, "reward_std": 0.08146540820598602, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5188140869140625, "rewards/correct_reward_func/std": 0.13371284306049347, "step": 646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2175.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 1484.5, "completions/mean_terminated_length": 1484.5, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 1.0077881619937694, "grad_norm": 0.5949665307998657, "kl": 0.05212471820414066, "learning_rate": 1.6025e-06, "loss": 0.0168, "num_tokens": 84305870.0, "reward": 1.5230427980422974, "reward_std": 0.049363430589437485, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5230426788330078, "rewards/correct_reward_func/std": 0.12559856474399567, "step": 647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 3188.0, "completions/mean_length": 1677.916748046875, "completions/mean_terminated_length": 1519.0364990234375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 1.0093457943925233, "grad_norm": 0.5194585919380188, "kl": 0.04932490363717079, "learning_rate": 1.6018749999999998e-06, "loss": 0.0928, "num_tokens": 84452887.0, "reward": 1.4853767156600952, "reward_std": 0.08370485156774521, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4853765368461609, "rewards/correct_reward_func/std": 0.16985315084457397, "step": 648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 1499.5714111328125, "completions/mean_terminated_length": 1499.5714111328125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 1.0109034267912773, "grad_norm": 0.6075387597084045, "kl": 0.05233858525753021, "learning_rate": 1.60125e-06, "loss": 0.0018, "num_tokens": 84585019.0, "reward": 1.551127314567566, "reward_std": 0.05537908524274826, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5511272549629211, "rewards/correct_reward_func/std": 0.14352430403232574, "step": 649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1476.90478515625, "completions/mean_terminated_length": 1476.90478515625, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 1.0124610591900312, "grad_norm": 0.5924170017242432, "kl": 0.05188839137554169, "learning_rate": 1.6006249999999998e-06, "loss": 0.0245, "num_tokens": 84714923.0, "reward": 1.5396331548690796, "reward_std": 0.06312854588031769, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5396330952644348, "rewards/correct_reward_func/std": 0.1734156459569931, "step": 650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1465.4881591796875, "completions/mean_terminated_length": 1465.4881591796875, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.014018691588785, "grad_norm": 0.597648024559021, "kl": 0.05065512843430042, "learning_rate": 1.6e-06, "loss": 0.0192, "num_tokens": 84844012.0, "reward": 1.5225285291671753, "reward_std": 0.05072065815329552, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5225285291671753, "rewards/correct_reward_func/std": 0.17245061695575714, "step": 651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 1471.21435546875, "completions/mean_terminated_length": 1471.21435546875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 1.0155763239875388, "grad_norm": 0.5615136623382568, "kl": 0.05269411765038967, "learning_rate": 1.5993749999999998e-06, "loss": -0.0099, "num_tokens": 84973576.0, "reward": 1.488713026046753, "reward_std": 0.05458061024546623, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48871293663978577, "rewards/correct_reward_func/std": 0.1673632711172104, "step": 652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3245.0, "completions/max_terminated_length": 3245.0, "completions/mean_length": 1455.2738037109375, "completions/mean_terminated_length": 1455.2738037109375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 1.017133956386293, "grad_norm": 0.5472370386123657, "kl": 0.05032069608569145, "learning_rate": 1.5987499999999997e-06, "loss": -0.0272, "num_tokens": 85101609.0, "reward": 1.532504916191101, "reward_std": 0.04785650223493576, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5325048565864563, "rewards/correct_reward_func/std": 0.19464264810085297, "step": 653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1474.0714111328125, "completions/mean_terminated_length": 1474.0714111328125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.0186915887850467, "grad_norm": 0.5767446756362915, "kl": 0.05521121807396412, "learning_rate": 1.5981249999999998e-06, "loss": -0.0144, "num_tokens": 85231431.0, "reward": 1.4579790830612183, "reward_std": 0.09587103873491287, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46988385915756226, "rewards/correct_reward_func/std": 0.13316239416599274, "step": 654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 1522.8333740234375, "completions/mean_terminated_length": 1522.8333740234375, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 1.0202492211838006, "grad_norm": 0.5800827741622925, "kl": 0.05396328121423721, "learning_rate": 1.5975e-06, "loss": 0.0364, "num_tokens": 85365211.0, "reward": 1.501022219657898, "reward_std": 0.076015904545784, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5010221600532532, "rewards/correct_reward_func/std": 0.15128661692142487, "step": 655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 1491.1429443359375, "completions/mean_terminated_length": 1491.1429443359375, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 1.0218068535825544, "grad_norm": 0.5550655126571655, "kl": 0.05126112699508667, "learning_rate": 1.596875e-06, "loss": 0.0014, "num_tokens": 85496383.0, "reward": 1.4312056303024292, "reward_std": 0.08455885946750641, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44311028718948364, "rewards/correct_reward_func/std": 0.12594355642795563, "step": 656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 1563.452392578125, "completions/mean_terminated_length": 1563.452392578125, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 1.0233644859813085, "grad_norm": 0.5272036194801331, "kl": 0.05299381539225578, "learning_rate": 1.59625e-06, "loss": -0.0094, "num_tokens": 85633941.0, "reward": 1.4453727006912231, "reward_std": 0.060865722596645355, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44537264108657837, "rewards/correct_reward_func/std": 0.12575693428516388, "step": 657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 1494.107177734375, "completions/mean_terminated_length": 1494.107177734375, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 1.0249221183800623, "grad_norm": 0.629683792591095, "kl": 0.053053101524710655, "learning_rate": 1.595625e-06, "loss": -0.0115, "num_tokens": 85765350.0, "reward": 1.4953358173370361, "reward_std": 0.06038458272814751, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49533572793006897, "rewards/correct_reward_func/std": 0.1519479900598526, "step": 658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 1514.0833740234375, "completions/mean_terminated_length": 1514.0833740234375, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 1.0264797507788161, "grad_norm": 0.5846103429794312, "kl": 0.05215497314929962, "learning_rate": 1.595e-06, "loss": -0.0266, "num_tokens": 85898593.0, "reward": 1.5464462041854858, "reward_std": 0.07301543653011322, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5464460253715515, "rewards/correct_reward_func/std": 0.11028709262609482, "step": 659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 1448.4405517578125, "completions/mean_terminated_length": 1448.4405517578125, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 1.02803738317757, "grad_norm": 0.6190741658210754, "kl": 0.052393680438399315, "learning_rate": 1.594375e-06, "loss": -0.0017, "num_tokens": 86026226.0, "reward": 1.4759933948516846, "reward_std": 0.04937182739377022, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47599339485168457, "rewards/correct_reward_func/std": 0.13999196887016296, "step": 660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 1479.8809814453125, "completions/mean_terminated_length": 1479.8809814453125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 1.029595015576324, "grad_norm": 0.5660926699638367, "kl": 0.05401911213994026, "learning_rate": 1.59375e-06, "loss": -0.0151, "num_tokens": 86156482.0, "reward": 1.4849108457565308, "reward_std": 0.07375740259885788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48491087555885315, "rewards/correct_reward_func/std": 0.17290189862251282, "step": 661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4504.0, "completions/max_terminated_length": 4504.0, "completions/mean_length": 1467.297607421875, "completions/mean_terminated_length": 1467.297607421875, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 1.0311526479750779, "grad_norm": 0.6604934930801392, "kl": 0.054234541952610016, "learning_rate": 1.5931249999999999e-06, "loss": 0.0182, "num_tokens": 86285585.0, "reward": 1.4347630739212036, "reward_std": 0.0708894431591034, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4347630441188812, "rewards/correct_reward_func/std": 0.1494257152080536, "step": 662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 1498.416748046875, "completions/mean_terminated_length": 1498.416748046875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 1.0327102803738317, "grad_norm": 0.5682238936424255, "kl": 0.05228089354932308, "learning_rate": 1.5925e-06, "loss": -0.0695, "num_tokens": 86417554.0, "reward": 1.4704004526138306, "reward_std": 0.09661635756492615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4704003930091858, "rewards/correct_reward_func/std": 0.16588261723518372, "step": 663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 1710.15478515625, "completions/mean_terminated_length": 1632.0601806640625, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 1.0342679127725856, "grad_norm": 0.565608024597168, "kl": 0.05014815367758274, "learning_rate": 1.591875e-06, "loss": 0.0822, "num_tokens": 86567447.0, "reward": 1.535957932472229, "reward_std": 0.08144499361515045, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5359576344490051, "rewards/correct_reward_func/std": 0.1807517409324646, "step": 664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1510.21435546875, "completions/mean_terminated_length": 1510.21435546875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 1.0358255451713396, "grad_norm": 0.5581820607185364, "kl": 0.0530538372695446, "learning_rate": 1.59125e-06, "loss": -0.0007, "num_tokens": 86700389.0, "reward": 1.5879935026168823, "reward_std": 0.06466535478830338, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5879934430122375, "rewards/correct_reward_func/std": 0.17678600549697876, "step": 665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1603.9881591796875, "completions/mean_terminated_length": 1524.6143798828125, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 1.0373831775700935, "grad_norm": 0.5471597909927368, "kl": 0.0508502759039402, "learning_rate": 1.590625e-06, "loss": 0.0707, "num_tokens": 86841100.0, "reward": 1.536754846572876, "reward_std": 0.0733090415596962, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5367547273635864, "rewards/correct_reward_func/std": 0.13262318074703217, "step": 666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 1527.5833740234375, "completions/mean_terminated_length": 1527.5833740234375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 1.0389408099688473, "grad_norm": 0.5596882104873657, "kl": 0.05326198227703571, "learning_rate": 1.59e-06, "loss": -0.0095, "num_tokens": 86975351.0, "reward": 1.5027247667312622, "reward_std": 0.06251788139343262, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5027247071266174, "rewards/correct_reward_func/std": 0.15841983258724213, "step": 667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1483.297607421875, "completions/mean_terminated_length": 1483.297607421875, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 1.0404984423676011, "grad_norm": 0.5668331384658813, "kl": 0.05328033119440079, "learning_rate": 1.589375e-06, "loss": 0.0127, "num_tokens": 87105834.0, "reward": 1.5575391054153442, "reward_std": 0.09122274816036224, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5575389862060547, "rewards/correct_reward_func/std": 0.16401030123233795, "step": 668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 1532.452392578125, "completions/mean_terminated_length": 1532.452392578125, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 1.0420560747663552, "grad_norm": 0.5774728655815125, "kl": 0.0538950152695179, "learning_rate": 1.58875e-06, "loss": -0.0052, "num_tokens": 87240410.0, "reward": 1.4293346405029297, "reward_std": 0.045015521347522736, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4293345808982849, "rewards/correct_reward_func/std": 0.10337743908166885, "step": 669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 1630.46435546875, "completions/mean_terminated_length": 1630.46435546875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 1.043613707165109, "grad_norm": 0.5443136692047119, "kl": 0.05456646718084812, "learning_rate": 1.588125e-06, "loss": 0.0483, "num_tokens": 87383351.0, "reward": 1.4322932958602905, "reward_std": 0.10355141013860703, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.45610272884368896, "rewards/correct_reward_func/std": 0.14299173653125763, "step": 670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1638.0714111328125, "completions/mean_terminated_length": 1559.1083984375, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 1.0451713395638629, "grad_norm": 0.5622561573982239, "kl": 0.05328943021595478, "learning_rate": 1.5874999999999998e-06, "loss": 0.0366, "num_tokens": 87527021.0, "reward": 1.4649417400360107, "reward_std": 0.10010730475187302, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4768464267253876, "rewards/correct_reward_func/std": 0.13820233941078186, "step": 671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1578.40478515625, "completions/mean_terminated_length": 1578.40478515625, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "epoch": 1.0467289719626167, "grad_norm": 0.5663480758666992, "kl": 0.055856646969914436, "learning_rate": 1.586875e-06, "loss": -0.0032, "num_tokens": 87665691.0, "reward": 1.511879324913025, "reward_std": 0.047249529510736465, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5118792653083801, "rewards/correct_reward_func/std": 0.18857014179229736, "step": 672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1583.3095703125, "completions/mean_terminated_length": 1583.3095703125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "epoch": 1.0482866043613708, "grad_norm": 0.5549023747444153, "kl": 0.05541318096220493, "learning_rate": 1.5862499999999998e-06, "loss": -0.0078, "num_tokens": 87804701.0, "reward": 1.4863669872283936, "reward_std": 0.08935274183750153, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4982716739177704, "rewards/correct_reward_func/std": 0.11579611152410507, "step": 673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 1576.75, "completions/mean_terminated_length": 1576.75, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 1.0498442367601246, "grad_norm": 0.5554832220077515, "kl": 0.056341828778386116, "learning_rate": 1.585625e-06, "loss": 0.0043, "num_tokens": 87943214.0, "reward": 1.4435389041900635, "reward_std": 0.06262954324483871, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4435388445854187, "rewards/correct_reward_func/std": 0.1332552284002304, "step": 674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 1566.166748046875, "completions/mean_terminated_length": 1566.166748046875, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 1.0514018691588785, "grad_norm": 0.5815324187278748, "kl": 0.05804356001317501, "learning_rate": 1.5849999999999999e-06, "loss": 0.0252, "num_tokens": 88080898.0, "reward": 1.5119918584823608, "reward_std": 0.07926620543003082, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5238964557647705, "rewards/correct_reward_func/std": 0.1911584585905075, "step": 675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 1587.4405517578125, "completions/mean_terminated_length": 1587.4405517578125, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "epoch": 1.0529595015576323, "grad_norm": 0.5776386857032776, "kl": 0.055761074647307396, "learning_rate": 1.584375e-06, "loss": 0.0302, "num_tokens": 88220291.0, "reward": 1.4661586284637451, "reward_std": 0.05945530906319618, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46615859866142273, "rewards/correct_reward_func/std": 0.11711110919713974, "step": 676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 1575.71435546875, "completions/mean_terminated_length": 1575.71435546875, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 1.0545171339563864, "grad_norm": 0.5745882987976074, "kl": 0.053962018340826035, "learning_rate": 1.5837499999999999e-06, "loss": 0.0093, "num_tokens": 88358657.0, "reward": 1.5064351558685303, "reward_std": 0.06469320505857468, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5064350962638855, "rewards/correct_reward_func/std": 0.15094655752182007, "step": 677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 1459.84521484375, "completions/mean_terminated_length": 1459.84521484375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.0560747663551402, "grad_norm": 0.5632085800170898, "kl": 0.0573277622461319, "learning_rate": 1.5831249999999998e-06, "loss": -0.0196, "num_tokens": 88487278.0, "reward": 1.5942164659500122, "reward_std": 0.08631883561611176, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.6061212420463562, "rewards/correct_reward_func/std": 0.1588619351387024, "step": 678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 1549.75, "completions/mean_terminated_length": 1549.75, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 1.057632398753894, "grad_norm": 0.5981232523918152, "kl": 0.05635823681950569, "learning_rate": 1.5824999999999999e-06, "loss": 0.0156, "num_tokens": 88623529.0, "reward": 1.4801684617996216, "reward_std": 0.04865288734436035, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4801684021949768, "rewards/correct_reward_func/std": 0.14745546877384186, "step": 679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 1631.0714111328125, "completions/mean_terminated_length": 1552.0240478515625, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 1.0591900311526479, "grad_norm": 0.5360672473907471, "kl": 0.05281771533191204, "learning_rate": 1.5818749999999998e-06, "loss": 0.0423, "num_tokens": 88766485.0, "reward": 1.507429599761963, "reward_std": 0.05878061428666115, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5074294805526733, "rewards/correct_reward_func/std": 0.1723850518465042, "step": 680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 1525.7381591796875, "completions/mean_terminated_length": 1525.7381591796875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 1.060747663551402, "grad_norm": 0.5318810939788818, "kl": 0.055077340453863144, "learning_rate": 1.58125e-06, "loss": -0.0029, "num_tokens": 88900719.0, "reward": 1.5089659690856934, "reward_std": 0.060308195650577545, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5089658498764038, "rewards/correct_reward_func/std": 0.17798349261283875, "step": 681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1492.8214111328125, "completions/mean_terminated_length": 1492.8214111328125, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 1.0623052959501558, "grad_norm": 0.6031827330589294, "kl": 0.05352173000574112, "learning_rate": 1.5806249999999998e-06, "loss": -0.0015, "num_tokens": 89031852.0, "reward": 1.4641478061676025, "reward_std": 0.06496328860521317, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4641478359699249, "rewards/correct_reward_func/std": 0.1625915914773941, "step": 682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 1627.09521484375, "completions/mean_terminated_length": 1548.0, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 1.0638629283489096, "grad_norm": 0.5318005084991455, "kl": 0.05356441251933575, "learning_rate": 1.58e-06, "loss": 0.0701, "num_tokens": 89174522.0, "reward": 1.5102105140686035, "reward_std": 0.09376493096351624, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5221152305603027, "rewards/correct_reward_func/std": 0.16014137864112854, "step": 683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1641.2381591796875, "completions/mean_terminated_length": 1562.313232421875, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 1.0654205607476634, "grad_norm": 0.5863988995552063, "kl": 0.055096494033932686, "learning_rate": 1.5793749999999998e-06, "loss": 0.0739, "num_tokens": 89318332.0, "reward": 1.4779866933822632, "reward_std": 0.0922832116484642, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48989132046699524, "rewards/correct_reward_func/std": 0.1688835322856903, "step": 684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1586.9405517578125, "completions/mean_terminated_length": 1586.9405517578125, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 1.0669781931464175, "grad_norm": 0.5507916212081909, "kl": 0.05459226667881012, "learning_rate": 1.5787500000000001e-06, "loss": -0.0182, "num_tokens": 89457557.0, "reward": 1.543197512626648, "reward_std": 0.06276614218950272, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5431973338127136, "rewards/correct_reward_func/std": 0.18692582845687866, "step": 685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4759.0, "completions/max_terminated_length": 4759.0, "completions/mean_length": 1548.416748046875, "completions/mean_terminated_length": 1548.416748046875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 1.0685358255451713, "grad_norm": 0.5588350296020508, "kl": 0.05437791533768177, "learning_rate": 1.578125e-06, "loss": -0.0017, "num_tokens": 89593504.0, "reward": 1.4723842144012451, "reward_std": 0.06399713456630707, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4723840355873108, "rewards/correct_reward_func/std": 0.15390437841415405, "step": 686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3083.0, "completions/max_terminated_length": 3083.0, "completions/mean_length": 1471.7738037109375, "completions/mean_terminated_length": 1471.7738037109375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.0700934579439252, "grad_norm": 0.6055949926376343, "kl": 0.05486376769840717, "learning_rate": 1.5775e-06, "loss": -0.0051, "num_tokens": 89723055.0, "reward": 1.460752010345459, "reward_std": 0.09401778876781464, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.472656786441803, "rewards/correct_reward_func/std": 0.18991245329380035, "step": 687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1501.357177734375, "completions/mean_terminated_length": 1501.357177734375, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 1.071651090342679, "grad_norm": 0.6138256788253784, "kl": 0.0549286063760519, "learning_rate": 1.576875e-06, "loss": 0.0095, "num_tokens": 89855031.0, "reward": 1.452558994293213, "reward_std": 0.062143485993146896, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45255884528160095, "rewards/correct_reward_func/std": 0.14738577604293823, "step": 688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1508.857177734375, "completions/mean_terminated_length": 1508.857177734375, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 1.073208722741433, "grad_norm": 0.5856159329414368, "kl": 0.05561050772666931, "learning_rate": 1.57625e-06, "loss": 0.0011, "num_tokens": 89987721.0, "reward": 1.4332712888717651, "reward_std": 0.04271453246474266, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4332713186740875, "rewards/correct_reward_func/std": 0.11550971120595932, "step": 689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2183.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 1527.1785888671875, "completions/mean_terminated_length": 1527.1785888671875, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 1.074766355140187, "grad_norm": 0.6025081276893616, "kl": 0.05726106837391853, "learning_rate": 1.575625e-06, "loss": 0.0185, "num_tokens": 90121794.0, "reward": 1.4257436990737915, "reward_std": 0.09166575968265533, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43764835596084595, "rewards/correct_reward_func/std": 0.14188778400421143, "step": 690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2313.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 1449.4761962890625, "completions/mean_terminated_length": 1449.4761962890625, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 1.0763239875389408, "grad_norm": 0.6137524843215942, "kl": 0.059694841504096985, "learning_rate": 1.575e-06, "loss": -0.0008, "num_tokens": 90249376.0, "reward": 1.5255221128463745, "reward_std": 0.05913606286048889, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5255220532417297, "rewards/correct_reward_func/std": 0.1678466647863388, "step": 691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 1516.2261962890625, "completions/mean_terminated_length": 1516.2261962890625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 1.0778816199376946, "grad_norm": 0.5627336502075195, "kl": 0.052463850006461143, "learning_rate": 1.574375e-06, "loss": 0.0015, "num_tokens": 90382571.0, "reward": 1.518011450767517, "reward_std": 0.06652691215276718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5180113911628723, "rewards/correct_reward_func/std": 0.13873633742332458, "step": 692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 1571.6429443359375, "completions/mean_terminated_length": 1491.8795166015625, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 1.0794392523364487, "grad_norm": 0.5776975154876709, "kl": 0.05454135872423649, "learning_rate": 1.57375e-06, "loss": 0.0496, "num_tokens": 90520505.0, "reward": 1.4717916250228882, "reward_std": 0.09959909319877625, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4956010580062866, "rewards/correct_reward_func/std": 0.12597030401229858, "step": 693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1448.7381591796875, "completions/mean_terminated_length": 1448.7381591796875, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 1.0809968847352025, "grad_norm": 0.6719677448272705, "kl": 0.054130397737026215, "learning_rate": 1.573125e-06, "loss": 0.0117, "num_tokens": 90648091.0, "reward": 1.4858334064483643, "reward_std": 0.048804186284542084, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4858333170413971, "rewards/correct_reward_func/std": 0.12014701217412949, "step": 694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1482.857177734375, "completions/mean_terminated_length": 1482.857177734375, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 1.0825545171339563, "grad_norm": 0.605505645275116, "kl": 0.05565035529434681, "learning_rate": 1.5725e-06, "loss": 0.0116, "num_tokens": 90778639.0, "reward": 1.4878506660461426, "reward_std": 0.05565100908279419, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4878506362438202, "rewards/correct_reward_func/std": 0.15627160668373108, "step": 695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6738.0, "completions/max_terminated_length": 6738.0, "completions/mean_length": 1650.5, "completions/mean_terminated_length": 1650.5, "completions/min_length": 1163.0, "completions/min_terminated_length": 1163.0, "epoch": 1.0841121495327102, "grad_norm": 0.5941537618637085, "kl": 0.051503732800483704, "learning_rate": 1.5718749999999999e-06, "loss": -0.0238, "num_tokens": 90923377.0, "reward": 1.477596640586853, "reward_std": 0.056624628603458405, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4775967299938202, "rewards/correct_reward_func/std": 0.1360875815153122, "step": 696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 1535.0, "completions/mean_terminated_length": 1535.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 1.0856697819314642, "grad_norm": 0.5858432054519653, "kl": 0.053479718044400215, "learning_rate": 1.57125e-06, "loss": 0.0207, "num_tokens": 91058353.0, "reward": 1.4727427959442139, "reward_std": 0.040079839527606964, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47274258732795715, "rewards/correct_reward_func/std": 0.10987861454486847, "step": 697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 1514.5357666015625, "completions/mean_terminated_length": 1514.5357666015625, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "epoch": 1.087227414330218, "grad_norm": 0.5789026618003845, "kl": 0.054706670343875885, "learning_rate": 1.5706249999999999e-06, "loss": 0.0078, "num_tokens": 91191598.0, "reward": 1.528199315071106, "reward_std": 0.13074904680252075, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.5639137029647827, "rewards/correct_reward_func/std": 0.17987313866615295, "step": 698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 1555.3333740234375, "completions/mean_terminated_length": 1555.3333740234375, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 1.088785046728972, "grad_norm": 0.551451563835144, "kl": 0.05342511832714081, "learning_rate": 1.57e-06, "loss": -0.0102, "num_tokens": 91328390.0, "reward": 1.528260588645935, "reward_std": 0.05848051980137825, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5282606482505798, "rewards/correct_reward_func/std": 0.16042807698249817, "step": 699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 1527.8333740234375, "completions/mean_terminated_length": 1527.8333740234375, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 1.0903426791277258, "grad_norm": 0.58552086353302, "kl": 0.05442274548113346, "learning_rate": 1.569375e-06, "loss": 0.029, "num_tokens": 91462740.0, "reward": 1.5053088665008545, "reward_std": 0.06954223662614822, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053088068962097, "rewards/correct_reward_func/std": 0.16963563859462738, "step": 700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1481.8690185546875, "completions/mean_terminated_length": 1401.0240478515625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 1.0919003115264798, "grad_norm": 0.590255618095398, "kl": 0.05315079167485237, "learning_rate": 1.56875e-06, "loss": 0.069, "num_tokens": 91593115.0, "reward": 1.4392790794372559, "reward_std": 0.10477132350206375, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4511837661266327, "rewards/correct_reward_func/std": 0.15620967745780945, "step": 701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 1494.0833740234375, "completions/mean_terminated_length": 1494.0833740234375, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 1.0934579439252337, "grad_norm": 0.5574760437011719, "kl": 0.053141290321946144, "learning_rate": 1.568125e-06, "loss": -0.0069, "num_tokens": 91724666.0, "reward": 1.500420331954956, "reward_std": 0.08308660238981247, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5123249888420105, "rewards/correct_reward_func/std": 0.17106066644191742, "step": 702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 1462.8809814453125, "completions/mean_terminated_length": 1462.8809814453125, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 1.0950155763239875, "grad_norm": 0.5716654062271118, "kl": 0.05562962777912617, "learning_rate": 1.5674999999999998e-06, "loss": 0.0293, "num_tokens": 91853392.0, "reward": 1.494890570640564, "reward_std": 0.05258150026202202, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4948905408382416, "rewards/correct_reward_func/std": 0.1593448370695114, "step": 703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3112.0, "completions/max_terminated_length": 3112.0, "completions/mean_length": 1546.15478515625, "completions/mean_terminated_length": 1546.15478515625, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 1.0965732087227413, "grad_norm": 0.5487161874771118, "kl": 0.053345413878560066, "learning_rate": 1.566875e-06, "loss": -0.0122, "num_tokens": 91989155.0, "reward": 1.492063045501709, "reward_std": 0.06741683930158615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49206307530403137, "rewards/correct_reward_func/std": 0.11516913026571274, "step": 704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2438.0, "completions/max_terminated_length": 2438.0, "completions/mean_length": 1550.75, "completions/mean_terminated_length": 1550.75, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 1.0981308411214954, "grad_norm": 0.5476036071777344, "kl": 0.052486877888441086, "learning_rate": 1.5662499999999998e-06, "loss": 0.0187, "num_tokens": 92125592.0, "reward": 1.5012890100479126, "reward_std": 0.047597870230674744, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5012890100479126, "rewards/correct_reward_func/std": 0.13995447754859924, "step": 705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 1472.666748046875, "completions/mean_terminated_length": 1472.666748046875, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 1.0996884735202492, "grad_norm": 0.6492244601249695, "kl": 0.05697629973292351, "learning_rate": 1.565625e-06, "loss": 0.004, "num_tokens": 92255038.0, "reward": 1.5068786144256592, "reward_std": 0.05082269757986069, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5068784952163696, "rewards/correct_reward_func/std": 0.17205660045146942, "step": 706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2334.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 1499.5357666015625, "completions/mean_terminated_length": 1499.5357666015625, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 1.101246105919003, "grad_norm": 0.6129737496376038, "kl": 0.05704494006931782, "learning_rate": 1.5649999999999998e-06, "loss": -0.0005, "num_tokens": 92386837.0, "reward": 1.4853978157043457, "reward_std": 0.09432291984558105, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49730247259140015, "rewards/correct_reward_func/std": 0.16147708892822266, "step": 707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2941.0, "completions/max_terminated_length": 2941.0, "completions/mean_length": 1572.71435546875, "completions/mean_terminated_length": 1572.71435546875, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 1.102803738317757, "grad_norm": 0.5524822473526001, "kl": 0.0564372930675745, "learning_rate": 1.564375e-06, "loss": 0.0266, "num_tokens": 92524921.0, "reward": 1.5710501670837402, "reward_std": 0.06597508490085602, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5710501670837402, "rewards/correct_reward_func/std": 0.14614151418209076, "step": 708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 1586.166748046875, "completions/mean_terminated_length": 1506.5782470703125, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 1.104361370716511, "grad_norm": 0.6021602153778076, "kl": 0.05207609385251999, "learning_rate": 1.5637499999999999e-06, "loss": 0.068, "num_tokens": 92664129.0, "reward": 1.3836668729782104, "reward_std": 0.12153889238834381, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.40747642517089844, "rewards/correct_reward_func/std": 0.16015641391277313, "step": 709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2707.0, "completions/mean_length": 1604.2381591796875, "completions/mean_terminated_length": 1524.867431640625, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 1.1059190031152648, "grad_norm": 0.5589121580123901, "kl": 0.05346103943884373, "learning_rate": 1.563125e-06, "loss": 0.0441, "num_tokens": 92804819.0, "reward": 1.4974178075790405, "reward_std": 0.1249975860118866, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5212271809577942, "rewards/correct_reward_func/std": 0.1472575068473816, "step": 710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 1471.5714111328125, "completions/mean_terminated_length": 1471.5714111328125, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 1.1074766355140186, "grad_norm": 0.591182291507721, "kl": 0.05628206580877304, "learning_rate": 1.5624999999999999e-06, "loss": -0.0112, "num_tokens": 92934359.0, "reward": 1.4635940790176392, "reward_std": 0.04556784778833389, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46359410881996155, "rewards/correct_reward_func/std": 0.13316887617111206, "step": 711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 1537.1905517578125, "completions/mean_terminated_length": 1537.1905517578125, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 1.1090342679127725, "grad_norm": 0.5901696681976318, "kl": 0.05872597172856331, "learning_rate": 1.5618749999999998e-06, "loss": 0.0229, "num_tokens": 93069801.0, "reward": 1.4981411695480347, "reward_std": 0.05785054340958595, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4981410503387451, "rewards/correct_reward_func/std": 0.16957618296146393, "step": 712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1465.3929443359375, "completions/mean_terminated_length": 1465.3929443359375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 1.1105919003115265, "grad_norm": 0.5794321298599243, "kl": 0.05666721984744072, "learning_rate": 1.5612499999999999e-06, "loss": 0.0165, "num_tokens": 93198726.0, "reward": 1.4143143892288208, "reward_std": 0.09174053370952606, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337099134922028, "rewards/correct_reward_func/mean": 0.4381239414215088, "rewards/correct_reward_func/std": 0.14089109003543854, "step": 713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2221.0, "completions/max_terminated_length": 2221.0, "completions/mean_length": 1468.0, "completions/mean_terminated_length": 1468.0, "completions/min_length": 808.0, "completions/min_terminated_length": 808.0, "epoch": 1.1121495327102804, "grad_norm": 0.6168317794799805, "kl": 0.058275070041418076, "learning_rate": 1.5606249999999998e-06, "loss": 0.0096, "num_tokens": 93327786.0, "reward": 1.4813557863235474, "reward_std": 0.061152730137109756, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.481355756521225, "rewards/correct_reward_func/std": 0.12457802891731262, "step": 714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1542.7261962890625, "completions/mean_terminated_length": 1542.7261962890625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 1.1137071651090342, "grad_norm": 0.5758678913116455, "kl": 0.05532221123576164, "learning_rate": 1.5599999999999999e-06, "loss": -0.0205, "num_tokens": 93463693.0, "reward": 1.5047788619995117, "reward_std": 0.07527534663677216, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5166835784912109, "rewards/correct_reward_func/std": 0.14771386981010437, "step": 715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5000.0, "completions/max_terminated_length": 5000.0, "completions/mean_length": 1462.511962890625, "completions/mean_terminated_length": 1462.511962890625, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 1.115264797507788, "grad_norm": 0.6303721070289612, "kl": 0.056314244866371155, "learning_rate": 1.5593749999999998e-06, "loss": 0.031, "num_tokens": 93592424.0, "reward": 1.4838083982467651, "reward_std": 0.0898590162396431, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4957130551338196, "rewards/correct_reward_func/std": 0.13271214067935944, "step": 716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 1583.261962890625, "completions/mean_terminated_length": 1503.6385498046875, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "epoch": 1.1168224299065421, "grad_norm": 0.5468500852584839, "kl": 0.0574983898550272, "learning_rate": 1.5587500000000001e-06, "loss": 0.0374, "num_tokens": 93731454.0, "reward": 1.3953533172607422, "reward_std": 0.08246518671512604, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4072580933570862, "rewards/correct_reward_func/std": 0.11090698093175888, "step": 717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 1642.4761962890625, "completions/mean_terminated_length": 1563.566162109375, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 1.118380062305296, "grad_norm": 0.542092502117157, "kl": 0.054435987025499344, "learning_rate": 1.558125e-06, "loss": 0.0746, "num_tokens": 93875542.0, "reward": 1.5064208507537842, "reward_std": 0.056892890483140945, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5064208507537842, "rewards/correct_reward_func/std": 0.17459480464458466, "step": 718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 1482.15478515625, "completions/mean_terminated_length": 1482.15478515625, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 1.1199376947040498, "grad_norm": 0.6088550686836243, "kl": 0.057859404012560844, "learning_rate": 1.5575000000000001e-06, "loss": -0.015, "num_tokens": 94005959.0, "reward": 1.4850451946258545, "reward_std": 0.07419510185718536, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4969499111175537, "rewards/correct_reward_func/std": 0.16233623027801514, "step": 719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1441.0, "completions/mean_terminated_length": 1441.0, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 1.1214953271028036, "grad_norm": 0.6208702325820923, "kl": 0.05701233819127083, "learning_rate": 1.556875e-06, "loss": -0.0108, "num_tokens": 94132781.0, "reward": 1.4269475936889648, "reward_std": 0.08062295615673065, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4388522207736969, "rewards/correct_reward_func/std": 0.11248524487018585, "step": 720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1485.1429443359375, "completions/mean_terminated_length": 1485.1429443359375, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 1.1230529595015577, "grad_norm": 0.6701772212982178, "kl": 0.055626364424824715, "learning_rate": 1.55625e-06, "loss": 0.0032, "num_tokens": 94263623.0, "reward": 1.5248923301696777, "reward_std": 0.06717394292354584, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.524892270565033, "rewards/correct_reward_func/std": 0.14444495737552643, "step": 721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 1517.5238037109375, "completions/mean_terminated_length": 1517.5238037109375, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 1.1246105919003115, "grad_norm": 0.5508123636245728, "kl": 0.05782832205295563, "learning_rate": 1.555625e-06, "loss": 0.015, "num_tokens": 94397047.0, "reward": 1.4941964149475098, "reward_std": 0.07925277948379517, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.506101131439209, "rewards/correct_reward_func/std": 0.2129984050989151, "step": 722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1478.202392578125, "completions/mean_terminated_length": 1478.202392578125, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.1261682242990654, "grad_norm": 0.612338125705719, "kl": 0.05537046305835247, "learning_rate": 1.555e-06, "loss": 0.0268, "num_tokens": 94527312.0, "reward": 1.466044306755066, "reward_std": 0.04727572575211525, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46604427695274353, "rewards/correct_reward_func/std": 0.12914837896823883, "step": 723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 1491.8333740234375, "completions/mean_terminated_length": 1491.8333740234375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.1277258566978192, "grad_norm": 0.5914537310600281, "kl": 0.05829664133489132, "learning_rate": 1.554375e-06, "loss": 0.0273, "num_tokens": 94658812.0, "reward": 1.4482581615447998, "reward_std": 0.07615670561790466, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4601627588272095, "rewards/correct_reward_func/std": 0.13598045706748962, "step": 724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 1619.916748046875, "completions/mean_terminated_length": 1619.916748046875, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 1.1292834890965733, "grad_norm": 0.5755424499511719, "kl": 0.05564289353787899, "learning_rate": 1.55375e-06, "loss": 0.0026, "num_tokens": 94800945.0, "reward": 1.4549850225448608, "reward_std": 0.08374127745628357, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4668896198272705, "rewards/correct_reward_func/std": 0.13782913982868195, "step": 725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1505.547607421875, "completions/mean_terminated_length": 1505.547607421875, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 1.1308411214953271, "grad_norm": 0.6021931767463684, "kl": 0.057859089225530624, "learning_rate": 1.553125e-06, "loss": 0.0118, "num_tokens": 94933441.0, "reward": 1.5193341970443726, "reward_std": 0.05812664330005646, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5193341374397278, "rewards/correct_reward_func/std": 0.12780743837356567, "step": 726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2649.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 1566.25, "completions/mean_terminated_length": 1566.25, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 1.132398753894081, "grad_norm": 0.6102822422981262, "kl": 0.05513947270810604, "learning_rate": 1.5525e-06, "loss": -0.0043, "num_tokens": 95071114.0, "reward": 1.4160356521606445, "reward_std": 0.09539022296667099, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.427940309047699, "rewards/correct_reward_func/std": 0.12938934564590454, "step": 727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 1524.96435546875, "completions/mean_terminated_length": 1524.96435546875, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 1.1339563862928348, "grad_norm": 0.6332623362541199, "kl": 0.05801095813512802, "learning_rate": 1.5518749999999999e-06, "loss": -0.0168, "num_tokens": 95205157.0, "reward": 1.424320101737976, "reward_std": 0.09377728402614594, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4481295943260193, "rewards/correct_reward_func/std": 0.16886721551418304, "step": 728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2660.0, "completions/mean_length": 1638.5714111328125, "completions/mean_terminated_length": 1559.6143798828125, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 1.1355140186915889, "grad_norm": 0.5859886407852173, "kl": 0.05399777367711067, "learning_rate": 1.55125e-06, "loss": 0.0627, "num_tokens": 95348821.0, "reward": 1.4587374925613403, "reward_std": 0.060993000864982605, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45873746275901794, "rewards/correct_reward_func/std": 0.14003029465675354, "step": 729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 1450.416748046875, "completions/mean_terminated_length": 1450.416748046875, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "epoch": 1.1370716510903427, "grad_norm": 0.5973284840583801, "kl": 0.0573828537017107, "learning_rate": 1.5506249999999999e-06, "loss": 0.0113, "num_tokens": 95476494.0, "reward": 1.4502209424972534, "reward_std": 0.060138970613479614, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45022091269493103, "rewards/correct_reward_func/std": 0.15389184653759003, "step": 730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2202.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 1481.25, "completions/mean_terminated_length": 1481.25, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 1.1386292834890965, "grad_norm": 0.6443908214569092, "kl": 0.05530379340052605, "learning_rate": 1.55e-06, "loss": 0.0496, "num_tokens": 95606907.0, "reward": 1.4538649320602417, "reward_std": 0.10928157716989517, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4776744246482849, "rewards/correct_reward_func/std": 0.12328770011663437, "step": 731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 1530.46435546875, "completions/mean_terminated_length": 1530.46435546875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 1.1401869158878504, "grad_norm": 0.5932245850563049, "kl": 0.055447425693273544, "learning_rate": 1.5493749999999999e-06, "loss": -0.0035, "num_tokens": 95741304.0, "reward": 1.5386000871658325, "reward_std": 0.05185175687074661, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5385999083518982, "rewards/correct_reward_func/std": 0.18387262523174286, "step": 732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1560.21435546875, "completions/mean_terminated_length": 1560.21435546875, "completions/min_length": 400.0, "completions/min_terminated_length": 400.0, "epoch": 1.1417445482866044, "grad_norm": 0.5909774899482727, "kl": 0.05969870463013649, "learning_rate": 1.54875e-06, "loss": -0.0359, "num_tokens": 95878398.0, "reward": 1.5041146278381348, "reward_std": 0.061541296541690826, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5041146278381348, "rewards/correct_reward_func/std": 0.19126980006694794, "step": 733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3407.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 1556.6429443359375, "completions/mean_terminated_length": 1556.6429443359375, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 1.1433021806853583, "grad_norm": 0.603911817073822, "kl": 0.05567294545471668, "learning_rate": 1.548125e-06, "loss": -0.0363, "num_tokens": 96015054.0, "reward": 1.4612985849380493, "reward_std": 0.0461239218711853, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46129852533340454, "rewards/correct_reward_func/std": 0.14767718315124512, "step": 734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 1577.1905517578125, "completions/mean_terminated_length": 1497.493896484375, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 1.144859813084112, "grad_norm": 0.5522368550300598, "kl": 0.057751892134547234, "learning_rate": 1.5475e-06, "loss": 0.035, "num_tokens": 96153442.0, "reward": 1.4211385250091553, "reward_std": 0.07991094887256622, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4211384952068329, "rewards/correct_reward_func/std": 0.1746172308921814, "step": 735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3317.0, "completions/max_terminated_length": 3317.0, "completions/mean_length": 1527.6309814453125, "completions/mean_terminated_length": 1527.6309814453125, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 1.146417445482866, "grad_norm": 0.5832193493843079, "kl": 0.05622788146138191, "learning_rate": 1.546875e-06, "loss": -0.008, "num_tokens": 96287799.0, "reward": 1.5053026676177979, "reward_std": 0.08288029581308365, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053026080131531, "rewards/correct_reward_func/std": 0.13812501728534698, "step": 736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 1501.9405517578125, "completions/mean_terminated_length": 1501.9405517578125, "completions/min_length": 972.0, "completions/min_terminated_length": 972.0, "epoch": 1.14797507788162, "grad_norm": 0.5426509976387024, "kl": 0.05514790490269661, "learning_rate": 1.5462499999999998e-06, "loss": 0.0405, "num_tokens": 96419938.0, "reward": 1.5058940649032593, "reward_std": 0.05818099156022072, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.505893886089325, "rewards/correct_reward_func/std": 0.14715726673603058, "step": 737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 1483.7381591796875, "completions/mean_terminated_length": 1483.7381591796875, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 1.1495327102803738, "grad_norm": 0.6014995574951172, "kl": 0.054408157244324684, "learning_rate": 1.545625e-06, "loss": -0.0165, "num_tokens": 96550512.0, "reward": 1.494113802909851, "reward_std": 0.08319792151451111, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5060185194015503, "rewards/correct_reward_func/std": 0.1316477656364441, "step": 738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1474.0, "completions/mean_terminated_length": 1474.0, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 1.1510903426791277, "grad_norm": 0.5481889843940735, "kl": 0.05591421760618687, "learning_rate": 1.5449999999999998e-06, "loss": -0.0051, "num_tokens": 96680106.0, "reward": 1.4785151481628418, "reward_std": 0.09400417655706406, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49041974544525146, "rewards/correct_reward_func/std": 0.12953902781009674, "step": 739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1488.416748046875, "completions/mean_terminated_length": 1488.416748046875, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "epoch": 1.1526479750778815, "grad_norm": 0.6241686940193176, "kl": 0.05947162210941315, "learning_rate": 1.544375e-06, "loss": 0.0004, "num_tokens": 96811097.0, "reward": 1.471253752708435, "reward_std": 0.09001534432172775, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4831584393978119, "rewards/correct_reward_func/std": 0.13641098141670227, "step": 740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1477.65478515625, "completions/mean_terminated_length": 1477.65478515625, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 1.1542056074766356, "grad_norm": 0.6100274324417114, "kl": 0.05378861352801323, "learning_rate": 1.5437499999999998e-06, "loss": 0.0034, "num_tokens": 96941142.0, "reward": 1.552506446838379, "reward_std": 0.06839028745889664, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5525063872337341, "rewards/correct_reward_func/std": 0.16896046698093414, "step": 741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 1511.71435546875, "completions/mean_terminated_length": 1511.71435546875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 1.1557632398753894, "grad_norm": 0.5556143522262573, "kl": 0.054157646372914314, "learning_rate": 1.543125e-06, "loss": 0.0093, "num_tokens": 97074222.0, "reward": 1.5962690114974976, "reward_std": 0.07085416465997696, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5962690711021423, "rewards/correct_reward_func/std": 0.1648954302072525, "step": 742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1490.90478515625, "completions/mean_terminated_length": 1490.90478515625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 1.1573208722741433, "grad_norm": 0.5736614465713501, "kl": 0.05447908118367195, "learning_rate": 1.5424999999999998e-06, "loss": -0.0221, "num_tokens": 97205578.0, "reward": 1.4858235120773315, "reward_std": 0.06773627549409866, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.485823392868042, "rewards/correct_reward_func/std": 0.17029505968093872, "step": 743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1460.4881591796875, "completions/mean_terminated_length": 1460.4881591796875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 1.158878504672897, "grad_norm": 0.5949994921684265, "kl": 0.057084500789642334, "learning_rate": 1.541875e-06, "loss": 0.018, "num_tokens": 97334043.0, "reward": 1.4434999227523804, "reward_std": 0.045735184103250504, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4434998035430908, "rewards/correct_reward_func/std": 0.12068561464548111, "step": 744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 1577.5595703125, "completions/mean_terminated_length": 1577.5595703125, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "epoch": 1.1604361370716512, "grad_norm": 0.5615612864494324, "kl": 0.05583513341844082, "learning_rate": 1.5412499999999999e-06, "loss": -0.0103, "num_tokens": 97472726.0, "reward": 1.538543939590454, "reward_std": 0.08571602404117584, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5504487156867981, "rewards/correct_reward_func/std": 0.15577484667301178, "step": 745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1543.857177734375, "completions/mean_terminated_length": 1463.759033203125, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 1.161993769470405, "grad_norm": 0.5778390765190125, "kl": 0.05601225048303604, "learning_rate": 1.5406249999999998e-06, "loss": 0.0514, "num_tokens": 97608350.0, "reward": 1.4838290214538574, "reward_std": 0.10284449905157089, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49573376774787903, "rewards/correct_reward_func/std": 0.18433628976345062, "step": 746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1518.25, "completions/mean_terminated_length": 1518.25, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 1.1635514018691588, "grad_norm": 0.5913203358650208, "kl": 0.05522367171943188, "learning_rate": 1.5399999999999999e-06, "loss": 0.0038, "num_tokens": 97741853.0, "reward": 1.5266317129135132, "reward_std": 0.08024472743272781, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5385364294052124, "rewards/correct_reward_func/std": 0.20109929144382477, "step": 747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2736.0, "completions/max_terminated_length": 2736.0, "completions/mean_length": 1569.3929443359375, "completions/mean_terminated_length": 1569.3929443359375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 1.1651090342679127, "grad_norm": 0.5452720522880554, "kl": 0.05439308471977711, "learning_rate": 1.5393749999999998e-06, "loss": 0.0348, "num_tokens": 97879742.0, "reward": 1.4626656770706177, "reward_std": 0.06041482463479042, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4626656174659729, "rewards/correct_reward_func/std": 0.11640315502882004, "step": 748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 1567.8333740234375, "completions/mean_terminated_length": 1567.8333740234375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 1.1666666666666667, "grad_norm": 0.6283567547798157, "kl": 0.05693612061440945, "learning_rate": 1.53875e-06, "loss": -0.0058, "num_tokens": 98017476.0, "reward": 1.4697226285934448, "reward_std": 0.08482564985752106, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4816272258758545, "rewards/correct_reward_func/std": 0.1434738039970398, "step": 749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2773.0, "completions/mean_length": 1612.59521484375, "completions/mean_terminated_length": 1533.3251953125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 1.1682242990654206, "grad_norm": 0.5389671325683594, "kl": 0.05255095474421978, "learning_rate": 1.538125e-06, "loss": 0.0242, "num_tokens": 98158934.0, "reward": 1.4848209619522095, "reward_std": 0.09422644972801208, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49672576785087585, "rewards/correct_reward_func/std": 0.184385284781456, "step": 750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 1567.71435546875, "completions/mean_terminated_length": 1567.71435546875, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 1.1697819314641744, "grad_norm": 0.548118531703949, "kl": 0.053853169083595276, "learning_rate": 1.5375e-06, "loss": -0.0182, "num_tokens": 98296712.0, "reward": 1.4945675134658813, "reward_std": 0.05834171921014786, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4945674240589142, "rewards/correct_reward_func/std": 0.15054771304130554, "step": 751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 1503.90478515625, "completions/mean_terminated_length": 1503.90478515625, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 1.1713395638629283, "grad_norm": 0.5890027284622192, "kl": 0.056156937032938004, "learning_rate": 1.536875e-06, "loss": -0.0101, "num_tokens": 98428914.0, "reward": 1.4326536655426025, "reward_std": 0.0644221380352974, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43265366554260254, "rewards/correct_reward_func/std": 0.13742317259311676, "step": 752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1571.2738037109375, "completions/mean_terminated_length": 1491.5059814453125, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "epoch": 1.1728971962616823, "grad_norm": 0.5988878607749939, "kl": 0.052749618887901306, "learning_rate": 1.53625e-06, "loss": 0.0316, "num_tokens": 98566955.0, "reward": 1.5509055852890015, "reward_std": 0.09258174151182175, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5628102421760559, "rewards/correct_reward_func/std": 0.15725326538085938, "step": 753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 1547.3929443359375, "completions/mean_terminated_length": 1547.3929443359375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 1.1744548286604362, "grad_norm": 0.5628390312194824, "kl": 0.057125985622406006, "learning_rate": 1.535625e-06, "loss": -0.0267, "num_tokens": 98703200.0, "reward": 1.5585520267486572, "reward_std": 0.1051463857293129, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5823614001274109, "rewards/correct_reward_func/std": 0.1266818791627884, "step": 754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3773.0, "completions/max_terminated_length": 3773.0, "completions/mean_length": 1583.25, "completions/mean_terminated_length": 1583.25, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 1.17601246105919, "grad_norm": 0.6228774785995483, "kl": 0.05425914190709591, "learning_rate": 1.535e-06, "loss": 0.026, "num_tokens": 98842133.0, "reward": 1.515499472618103, "reward_std": 0.07798980176448822, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5154992938041687, "rewards/correct_reward_func/std": 0.17263923585414886, "step": 755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2779.0, "completions/max_terminated_length": 2779.0, "completions/mean_length": 1557.3929443359375, "completions/mean_terminated_length": 1557.3929443359375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "epoch": 1.1775700934579438, "grad_norm": 0.5741190910339355, "kl": 0.054265307262539864, "learning_rate": 1.534375e-06, "loss": -0.0196, "num_tokens": 98978948.0, "reward": 1.4811805486679077, "reward_std": 0.05035428702831268, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48118042945861816, "rewards/correct_reward_func/std": 0.15768390893936157, "step": 756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3854.0, "completions/max_terminated_length": 3854.0, "completions/mean_length": 1670.9405517578125, "completions/mean_terminated_length": 1670.9405517578125, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 1.179127725856698, "grad_norm": 0.5553456544876099, "kl": 0.0542374923825264, "learning_rate": 1.53375e-06, "loss": -0.0191, "num_tokens": 99125421.0, "reward": 1.4518465995788574, "reward_std": 0.10473170131444931, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4756561517715454, "rewards/correct_reward_func/std": 0.1895512491464615, "step": 757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6732.0, "completions/max_terminated_length": 6732.0, "completions/mean_length": 1586.0, "completions/mean_terminated_length": 1586.0, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "epoch": 1.1806853582554517, "grad_norm": 0.5976909399032593, "kl": 0.05252276733517647, "learning_rate": 1.533125e-06, "loss": -0.0277, "num_tokens": 99264657.0, "reward": 1.541321873664856, "reward_std": 0.06658712029457092, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5413219928741455, "rewards/correct_reward_func/std": 0.15267948806285858, "step": 758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1618.4405517578125, "completions/mean_terminated_length": 1539.240966796875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 1.1822429906542056, "grad_norm": 0.5704209804534912, "kl": 0.053550003096461296, "learning_rate": 1.5325e-06, "loss": 0.0664, "num_tokens": 99406582.0, "reward": 1.5433764457702637, "reward_std": 0.07263385504484177, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5433762669563293, "rewards/correct_reward_func/std": 0.1595163196325302, "step": 759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2675.0, "completions/max_terminated_length": 2675.0, "completions/mean_length": 1665.4405517578125, "completions/mean_terminated_length": 1665.4405517578125, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 1.1838006230529594, "grad_norm": 0.536896288394928, "kl": 0.05155404657125473, "learning_rate": 1.531875e-06, "loss": -0.0099, "num_tokens": 99552863.0, "reward": 1.5041189193725586, "reward_std": 0.042749807238578796, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.504118800163269, "rewards/correct_reward_func/std": 0.11967188119888306, "step": 760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2180.0, "completions/max_terminated_length": 2180.0, "completions/mean_length": 1572.3214111328125, "completions/mean_terminated_length": 1572.3214111328125, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 1.1853582554517135, "grad_norm": 0.654563307762146, "kl": 0.06319995783269405, "learning_rate": 1.53125e-06, "loss": -0.0178, "num_tokens": 99690944.0, "reward": 1.5238220691680908, "reward_std": 0.07384101301431656, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5238221287727356, "rewards/correct_reward_func/std": 0.14978064596652985, "step": 761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 1673.5833740234375, "completions/mean_terminated_length": 1514.5975341796875, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 1.1869158878504673, "grad_norm": 0.6024786233901978, "kl": 0.05431670695543289, "learning_rate": 1.5306249999999999e-06, "loss": 0.0323, "num_tokens": 99837525.0, "reward": 1.456794261932373, "reward_std": 0.12691636383533478, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4686989486217499, "rewards/correct_reward_func/std": 0.18105122447013855, "step": 762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 1636.9285888671875, "completions/mean_terminated_length": 1557.9517822265625, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 1.1884735202492211, "grad_norm": 0.5757026076316833, "kl": 0.05303757078945637, "learning_rate": 1.53e-06, "loss": 0.0344, "num_tokens": 99980949.0, "reward": 1.5024532079696655, "reward_std": 0.09310808032751083, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5143579244613647, "rewards/correct_reward_func/std": 0.16830983757972717, "step": 763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6305.0, "completions/max_terminated_length": 6305.0, "completions/mean_length": 1643.8214111328125, "completions/mean_terminated_length": 1643.8214111328125, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "epoch": 1.190031152647975, "grad_norm": 0.5751224160194397, "kl": 0.051150599494576454, "learning_rate": 1.5293749999999999e-06, "loss": 0.0619, "num_tokens": 100124964.0, "reward": 1.460810899734497, "reward_std": 0.07619532197713852, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46081089973449707, "rewards/correct_reward_func/std": 0.1678754985332489, "step": 764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 1611.1905517578125, "completions/mean_terminated_length": 1611.1905517578125, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 1.191588785046729, "grad_norm": 0.560835599899292, "kl": 0.05420506186783314, "learning_rate": 1.52875e-06, "loss": 0.0125, "num_tokens": 100266358.0, "reward": 1.4359450340270996, "reward_std": 0.12514643371105194, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4597545862197876, "rewards/correct_reward_func/std": 0.14009708166122437, "step": 765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 1526.59521484375, "completions/mean_terminated_length": 1526.59521484375, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 1.1931464174454829, "grad_norm": 0.6195681095123291, "kl": 0.055776726454496384, "learning_rate": 1.5281249999999999e-06, "loss": -0.0237, "num_tokens": 100400232.0, "reward": 1.5460232496261597, "reward_std": 0.07064041495323181, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5460231900215149, "rewards/correct_reward_func/std": 0.16844241321086884, "step": 766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2578.0, "completions/max_terminated_length": 2578.0, "completions/mean_length": 1550.90478515625, "completions/mean_terminated_length": 1550.90478515625, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 1.1947040498442367, "grad_norm": 0.6004679799079895, "kl": 0.054388487711548805, "learning_rate": 1.5275e-06, "loss": 0.0207, "num_tokens": 100536700.0, "reward": 1.5176633596420288, "reward_std": 0.07351231575012207, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5176632404327393, "rewards/correct_reward_func/std": 0.1611538827419281, "step": 767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 1581.1905517578125, "completions/mean_terminated_length": 1581.1905517578125, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 1.1962616822429906, "grad_norm": 0.6167070269584656, "kl": 0.0536374356597662, "learning_rate": 1.5268749999999999e-06, "loss": 0.0052, "num_tokens": 100675316.0, "reward": 1.4759693145751953, "reward_std": 0.05997619405388832, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47596922516822815, "rewards/correct_reward_func/std": 0.12434441596269608, "step": 768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2657.0, "completions/max_terminated_length": 2657.0, "completions/mean_length": 1585.3095703125, "completions/mean_terminated_length": 1585.3095703125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 1.1978193146417446, "grad_norm": 0.5705325603485107, "kl": 0.05308514088392258, "learning_rate": 1.52625e-06, "loss": -0.0086, "num_tokens": 100814392.0, "reward": 1.4863255023956299, "reward_std": 0.06346695870161057, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4863254427909851, "rewards/correct_reward_func/std": 0.16247457265853882, "step": 769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 1588.40478515625, "completions/mean_terminated_length": 1588.40478515625, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 1.1993769470404985, "grad_norm": 0.5928447246551514, "kl": 0.059964681044220924, "learning_rate": 1.525625e-06, "loss": 0.0165, "num_tokens": 100953746.0, "reward": 1.4094767570495605, "reward_std": 0.10904749482870102, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43328621983528137, "rewards/correct_reward_func/std": 0.16408738493919373, "step": 770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2691.0, "completions/max_terminated_length": 2691.0, "completions/mean_length": 1599.1190185546875, "completions/mean_terminated_length": 1599.1190185546875, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "epoch": 1.2009345794392523, "grad_norm": 0.5530892014503479, "kl": 0.054923782125115395, "learning_rate": 1.5249999999999998e-06, "loss": -0.0012, "num_tokens": 101094066.0, "reward": 1.5157004594802856, "reward_std": 0.08739394694566727, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5276051759719849, "rewards/correct_reward_func/std": 0.14603330194950104, "step": 771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2726.0, "completions/max_terminated_length": 2726.0, "completions/mean_length": 1564.0, "completions/mean_terminated_length": 1564.0, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 1.2024922118380061, "grad_norm": 0.600959062576294, "kl": 0.05353361554443836, "learning_rate": 1.524375e-06, "loss": -0.0121, "num_tokens": 101231214.0, "reward": 1.5206904411315918, "reward_std": 0.06717798113822937, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5206903219223022, "rewards/correct_reward_func/std": 0.1800759881734848, "step": 772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 1631.8214111328125, "completions/mean_terminated_length": 1552.7830810546875, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 1.2040498442367602, "grad_norm": 0.591049075126648, "kl": 0.05524212867021561, "learning_rate": 1.5237499999999998e-06, "loss": 0.0453, "num_tokens": 101374383.0, "reward": 1.4854098558425903, "reward_std": 0.07667335867881775, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4854099750518799, "rewards/correct_reward_func/std": 0.15584157407283783, "step": 773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 1677.2857666015625, "completions/mean_terminated_length": 1598.795166015625, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 1.205607476635514, "grad_norm": 0.5642656683921814, "kl": 0.05100255832076073, "learning_rate": 1.523125e-06, "loss": 0.0284, "num_tokens": 101521269.0, "reward": 1.45835280418396, "reward_std": 0.05762210488319397, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4583527743816376, "rewards/correct_reward_func/std": 0.13773544132709503, "step": 774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 1732.1905517578125, "completions/mean_terminated_length": 1654.361328125, "completions/min_length": 1083.0, "completions/min_terminated_length": 1083.0, "epoch": 1.2071651090342679, "grad_norm": 0.554004430770874, "kl": 0.05230839550495148, "learning_rate": 1.5224999999999998e-06, "loss": 0.0666, "num_tokens": 101672821.0, "reward": 1.4651694297790527, "reward_std": 0.07731819152832031, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4651694595813751, "rewards/correct_reward_func/std": 0.17472968995571136, "step": 775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1597.261962890625, "completions/mean_terminated_length": 1597.261962890625, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 1.2087227414330217, "grad_norm": 0.5966112017631531, "kl": 0.05261442996561527, "learning_rate": 1.521875e-06, "loss": 0.0023, "num_tokens": 101812859.0, "reward": 1.4308266639709473, "reward_std": 0.10777775943279266, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4427313506603241, "rewards/correct_reward_func/std": 0.1671162247657776, "step": 776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 1608.9761962890625, "completions/mean_terminated_length": 1608.9761962890625, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 1.2102803738317758, "grad_norm": 0.5703216195106506, "kl": 0.05495970882475376, "learning_rate": 1.5212499999999998e-06, "loss": 0.0232, "num_tokens": 101954199.0, "reward": 1.5345309972763062, "reward_std": 0.06536292284727097, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5345310568809509, "rewards/correct_reward_func/std": 0.11891984939575195, "step": 777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 1717.4285888671875, "completions/mean_terminated_length": 1639.421630859375, "completions/min_length": 1117.0, "completions/min_terminated_length": 1117.0, "epoch": 1.2118380062305296, "grad_norm": 0.5498438477516174, "kl": 0.05176936648786068, "learning_rate": 1.5206249999999997e-06, "loss": 0.0517, "num_tokens": 102104439.0, "reward": 1.4425933361053467, "reward_std": 0.11579939723014832, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4664028584957123, "rewards/correct_reward_func/std": 0.1846941113471985, "step": 778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2423.0, "completions/max_terminated_length": 2423.0, "completions/mean_length": 1562.7381591796875, "completions/mean_terminated_length": 1562.7381591796875, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 1.2133956386292835, "grad_norm": 0.5859647393226624, "kl": 0.05293230712413788, "learning_rate": 1.5199999999999998e-06, "loss": -0.0041, "num_tokens": 102241631.0, "reward": 1.4774600267410278, "reward_std": 0.05781900882720947, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4774600565433502, "rewards/correct_reward_func/std": 0.12344729900360107, "step": 779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 1499.5714111328125, "completions/mean_terminated_length": 1499.5714111328125, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 1.2149532710280373, "grad_norm": 0.6209301948547363, "kl": 0.05409579910337925, "learning_rate": 1.5193749999999997e-06, "loss": -0.0065, "num_tokens": 102373565.0, "reward": 1.4805545806884766, "reward_std": 0.07722529768943787, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4805544912815094, "rewards/correct_reward_func/std": 0.19578884541988373, "step": 780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1559.8214111328125, "completions/mean_terminated_length": 1559.8214111328125, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 1.2165109034267914, "grad_norm": 0.5819864869117737, "kl": 0.05286850035190582, "learning_rate": 1.51875e-06, "loss": 0.0105, "num_tokens": 102510716.0, "reward": 1.4312827587127686, "reward_std": 0.07989110052585602, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44318753480911255, "rewards/correct_reward_func/std": 0.11926353722810745, "step": 781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 1587.0833740234375, "completions/mean_terminated_length": 1507.5059814453125, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 1.2180685358255452, "grad_norm": 0.5988348126411438, "kl": 0.051638057455420494, "learning_rate": 1.518125e-06, "loss": 0.0794, "num_tokens": 102649935.0, "reward": 1.5066888332366943, "reward_std": 0.08049983531236649, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5066885948181152, "rewards/correct_reward_func/std": 0.1656782031059265, "step": 782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 1501.9761962890625, "completions/mean_terminated_length": 1501.9761962890625, "completions/min_length": 1014.0, "completions/min_terminated_length": 1014.0, "epoch": 1.219626168224299, "grad_norm": 0.5800265669822693, "kl": 0.05443391762673855, "learning_rate": 1.5175e-06, "loss": -0.0117, "num_tokens": 102782029.0, "reward": 1.5396227836608887, "reward_std": 0.06960324198007584, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5396227240562439, "rewards/correct_reward_func/std": 0.1666199415922165, "step": 783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2060.0, "completions/mean_length": 1564.4761962890625, "completions/mean_terminated_length": 1484.62646484375, "completions/min_length": 515.0, "completions/min_terminated_length": 515.0, "epoch": 1.2211838006230529, "grad_norm": 0.5702518224716187, "kl": 0.05206519551575184, "learning_rate": 1.516875e-06, "loss": 0.0765, "num_tokens": 102919433.0, "reward": 1.438029408454895, "reward_std": 0.07640068233013153, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4380292296409607, "rewards/correct_reward_func/std": 0.15191110968589783, "step": 784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2243.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 1460.09521484375, "completions/mean_terminated_length": 1460.09521484375, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "epoch": 1.222741433021807, "grad_norm": 0.648040771484375, "kl": 0.056264620274305344, "learning_rate": 1.51625e-06, "loss": -0.0015, "num_tokens": 103048027.0, "reward": 1.4138339757919312, "reward_std": 0.07412144541740417, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.413833886384964, "rewards/correct_reward_func/std": 0.09973495453596115, "step": 785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1595.1190185546875, "completions/mean_terminated_length": 1595.1190185546875, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 1.2242990654205608, "grad_norm": 0.5783217549324036, "kl": 0.05306544341146946, "learning_rate": 1.515625e-06, "loss": 0.0194, "num_tokens": 103187927.0, "reward": 1.461424708366394, "reward_std": 0.08573352545499802, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47332945466041565, "rewards/correct_reward_func/std": 0.11087380349636078, "step": 786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 1453.5833740234375, "completions/mean_terminated_length": 1453.5833740234375, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 1.2258566978193146, "grad_norm": 0.5750983953475952, "kl": 0.052959585562348366, "learning_rate": 1.515e-06, "loss": 0.0158, "num_tokens": 103315854.0, "reward": 1.4556964635849, "reward_std": 0.0923253521323204, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4676012396812439, "rewards/correct_reward_func/std": 0.11101017147302628, "step": 787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1472.0357666015625, "completions/mean_terminated_length": 1472.0357666015625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 1.2274143302180685, "grad_norm": 0.5796962976455688, "kl": 0.054285503923892975, "learning_rate": 1.514375e-06, "loss": -0.0063, "num_tokens": 103445481.0, "reward": 1.4726245403289795, "reward_std": 0.05445545166730881, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4726243317127228, "rewards/correct_reward_func/std": 0.16170987486839294, "step": 788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1448.916748046875, "completions/mean_terminated_length": 1448.916748046875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 1.2289719626168225, "grad_norm": 0.6210680603981018, "kl": 0.052961185574531555, "learning_rate": 1.51375e-06, "loss": -0.0273, "num_tokens": 103573094.0, "reward": 1.514704942703247, "reward_std": 0.07752978056669235, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5147048830986023, "rewards/correct_reward_func/std": 0.15763655304908752, "step": 789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1471.1429443359375, "completions/mean_terminated_length": 1390.1685791015625, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 1.2305295950155763, "grad_norm": 0.5890511870384216, "kl": 0.052992820739746094, "learning_rate": 1.513125e-06, "loss": 0.0698, "num_tokens": 103702586.0, "reward": 1.4430148601531982, "reward_std": 0.06676966696977615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44301486015319824, "rewards/correct_reward_func/std": 0.11303845793008804, "step": 790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 1537.4881591796875, "completions/mean_terminated_length": 1537.4881591796875, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "epoch": 1.2320872274143302, "grad_norm": 0.5926904678344727, "kl": 0.05367530323565006, "learning_rate": 1.5125e-06, "loss": 0.0364, "num_tokens": 103837849.0, "reward": 1.4797751903533936, "reward_std": 0.08245012909173965, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4916798174381256, "rewards/correct_reward_func/std": 0.16100813448429108, "step": 791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 1451.3095703125, "completions/mean_terminated_length": 1451.3095703125, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 1.233644859813084, "grad_norm": 0.6183095574378967, "kl": 0.054510580375790596, "learning_rate": 1.511875e-06, "loss": -0.0093, "num_tokens": 103965813.0, "reward": 1.561307430267334, "reward_std": 0.07561670243740082, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.573212206363678, "rewards/correct_reward_func/std": 0.1895819902420044, "step": 792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1376.7381591796875, "completions/mean_terminated_length": 1376.7381591796875, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 1.235202492211838, "grad_norm": 0.5449400544166565, "kl": 0.05295031704008579, "learning_rate": 1.51125e-06, "loss": -0.0281, "num_tokens": 104087249.0, "reward": 1.4594392776489258, "reward_std": 0.07898024469614029, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47134390473365784, "rewards/correct_reward_func/std": 0.20434388518333435, "step": 793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 1471.702392578125, "completions/mean_terminated_length": 1471.702392578125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 1.236760124610592, "grad_norm": 0.589128315448761, "kl": 0.05422963760793209, "learning_rate": 1.510625e-06, "loss": 0.0004, "num_tokens": 104217034.0, "reward": 1.392501711845398, "reward_std": 0.060993742197752, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3925016522407532, "rewards/correct_reward_func/std": 0.11965753883123398, "step": 794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 1451.5238037109375, "completions/mean_terminated_length": 1451.5238037109375, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 1.2383177570093458, "grad_norm": 0.598427414894104, "kl": 0.054455362260341644, "learning_rate": 1.51e-06, "loss": 0.0074, "num_tokens": 104345004.0, "reward": 1.540875792503357, "reward_std": 0.0700727179646492, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5408757328987122, "rewards/correct_reward_func/std": 0.15536341071128845, "step": 795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 1472.4285888671875, "completions/mean_terminated_length": 1472.4285888671875, "completions/min_length": 676.0, "completions/min_terminated_length": 676.0, "epoch": 1.2398753894080996, "grad_norm": 0.6184532046318054, "kl": 0.052908554673194885, "learning_rate": 1.5093749999999998e-06, "loss": -0.0219, "num_tokens": 104474592.0, "reward": 1.5032671689987183, "reward_std": 0.0605204813182354, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5032671689987183, "rewards/correct_reward_func/std": 0.16684666275978088, "step": 796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1442.8095703125, "completions/mean_terminated_length": 1442.8095703125, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 1.2414330218068537, "grad_norm": 0.5926938652992249, "kl": 0.053903037682175636, "learning_rate": 1.50875e-06, "loss": 0.0118, "num_tokens": 104601602.0, "reward": 1.4345961809158325, "reward_std": 0.10142410546541214, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4465009272098541, "rewards/correct_reward_func/std": 0.18178771436214447, "step": 797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 1514.2381591796875, "completions/mean_terminated_length": 1433.7830810546875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 1.2429906542056075, "grad_norm": 0.5866039991378784, "kl": 0.05417381040751934, "learning_rate": 1.5081249999999999e-06, "loss": 0.0796, "num_tokens": 104734858.0, "reward": 1.463507890701294, "reward_std": 0.10972185432910919, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47541263699531555, "rewards/correct_reward_func/std": 0.16103945672512054, "step": 798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2461.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 1449.166748046875, "completions/mean_terminated_length": 1449.166748046875, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 1.2445482866043613, "grad_norm": 0.6338303685188293, "kl": 0.05405573919415474, "learning_rate": 1.5075e-06, "loss": -0.0178, "num_tokens": 104862576.0, "reward": 1.49515700340271, "reward_std": 0.06454695761203766, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4951569736003876, "rewards/correct_reward_func/std": 0.12489238381385803, "step": 799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2126.0, "completions/max_terminated_length": 2126.0, "completions/mean_length": 1346.6905517578125, "completions/mean_terminated_length": 1346.6905517578125, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 1.2461059190031152, "grad_norm": 0.6101464033126831, "kl": 0.05465280823409557, "learning_rate": 1.5068749999999999e-06, "loss": -0.0081, "num_tokens": 104981644.0, "reward": 1.4770976305007935, "reward_std": 0.05906420946121216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.477097749710083, "rewards/correct_reward_func/std": 0.1510917842388153, "step": 800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1417.34521484375, "completions/mean_terminated_length": 1417.34521484375, "completions/min_length": 512.0, "completions/min_terminated_length": 512.0, "epoch": 1.2476635514018692, "grad_norm": 0.578747570514679, "kl": 0.0536639466881752, "learning_rate": 1.50625e-06, "loss": -0.0087, "num_tokens": 105106893.0, "reward": 1.496096134185791, "reward_std": 0.08548455685377121, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5080008506774902, "rewards/correct_reward_func/std": 0.1339438557624817, "step": 801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1397.0357666015625, "completions/mean_terminated_length": 1397.0357666015625, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 1.249221183800623, "grad_norm": 0.5749726295471191, "kl": 0.05151655338704586, "learning_rate": 1.5056249999999999e-06, "loss": 0.0041, "num_tokens": 105230040.0, "reward": 1.5566080808639526, "reward_std": 0.060173988342285156, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5566080212593079, "rewards/correct_reward_func/std": 0.13863953948020935, "step": 802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 1415.857177734375, "completions/mean_terminated_length": 1415.857177734375, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 1.250778816199377, "grad_norm": 0.5830151438713074, "kl": 0.05197390355169773, "learning_rate": 1.5049999999999998e-06, "loss": 0.012, "num_tokens": 105355122.0, "reward": 1.4777034521102905, "reward_std": 0.08437201380729675, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4896082282066345, "rewards/correct_reward_func/std": 0.17309200763702393, "step": 803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1495.96435546875, "completions/mean_terminated_length": 1415.2890625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 1.2523364485981308, "grad_norm": 0.5742617845535278, "kl": 0.05096680857241154, "learning_rate": 1.5043749999999999e-06, "loss": 0.0779, "num_tokens": 105486867.0, "reward": 1.4874447584152222, "reward_std": 0.08846469223499298, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49934953451156616, "rewards/correct_reward_func/std": 0.18881730735301971, "step": 804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1402.2381591796875, "completions/mean_terminated_length": 1402.2381591796875, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "epoch": 1.2538940809968846, "grad_norm": 0.5524767637252808, "kl": 0.05113302171230316, "learning_rate": 1.5037499999999998e-06, "loss": 0.0275, "num_tokens": 105610739.0, "reward": 1.5092952251434326, "reward_std": 0.0812416523694992, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5211998820304871, "rewards/correct_reward_func/std": 0.14597436785697937, "step": 805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1420.2381591796875, "completions/mean_terminated_length": 1420.2381591796875, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 1.2554517133956387, "grad_norm": 0.6224563121795654, "kl": 0.05363850295543671, "learning_rate": 1.503125e-06, "loss": 0.0492, "num_tokens": 105736045.0, "reward": 1.503407597541809, "reward_std": 0.06171470135450363, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5034075379371643, "rewards/correct_reward_func/std": 0.09481731802225113, "step": 806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2732.0, "completions/max_terminated_length": 2732.0, "completions/mean_length": 1451.0714111328125, "completions/mean_terminated_length": 1451.0714111328125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 1.2570093457943925, "grad_norm": 0.5917273759841919, "kl": 0.05363740585744381, "learning_rate": 1.5024999999999998e-06, "loss": -0.02, "num_tokens": 105863953.0, "reward": 1.4894373416900635, "reward_std": 0.08506947755813599, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48943719267845154, "rewards/correct_reward_func/std": 0.12193943560123444, "step": 807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1370.9881591796875, "completions/mean_terminated_length": 1370.9881591796875, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 1.2585669781931463, "grad_norm": 0.6407949328422546, "kl": 0.060399917885661125, "learning_rate": 1.501875e-06, "loss": -0.0369, "num_tokens": 105985110.0, "reward": 1.5478301048278809, "reward_std": 0.09143880754709244, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5597350001335144, "rewards/correct_reward_func/std": 0.18025587499141693, "step": 808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2024.0, "completions/max_terminated_length": 2024.0, "completions/mean_length": 1356.6190185546875, "completions/mean_terminated_length": 1356.6190185546875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 1.2601246105919004, "grad_norm": 0.606340229511261, "kl": 0.05455312877893448, "learning_rate": 1.5012499999999998e-06, "loss": 0.0102, "num_tokens": 106104976.0, "reward": 1.470739483833313, "reward_std": 0.09121835976839066, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4826441705226898, "rewards/correct_reward_func/std": 0.1137034073472023, "step": 809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 1529.71435546875, "completions/mean_terminated_length": 1449.4456787109375, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "epoch": 1.2616822429906542, "grad_norm": 0.591900646686554, "kl": 0.05042143538594246, "learning_rate": 1.500625e-06, "loss": 0.0802, "num_tokens": 106239712.0, "reward": 1.4532712697982788, "reward_std": 0.06579039990901947, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45327118039131165, "rewards/correct_reward_func/std": 0.1620456874370575, "step": 810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1979.0, "completions/max_terminated_length": 1979.0, "completions/mean_length": 1393.09521484375, "completions/mean_terminated_length": 1393.09521484375, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 1.263239875389408, "grad_norm": 0.5970065593719482, "kl": 0.05317961238324642, "learning_rate": 1.5e-06, "loss": 0.0041, "num_tokens": 106362738.0, "reward": 1.5704677104949951, "reward_std": 0.04940271005034447, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.570467472076416, "rewards/correct_reward_func/std": 0.14148163795471191, "step": 811 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2174.0, "completions/mean_length": 1395.2261962890625, "completions/mean_terminated_length": 1313.3372802734375, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.264797507788162, "grad_norm": 0.5777062177658081, "kl": 0.05635930597782135, "learning_rate": 1.499375e-06, "loss": 0.064, "num_tokens": 106485817.0, "reward": 1.482456088066101, "reward_std": 0.08446712046861649, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4824560284614563, "rewards/correct_reward_func/std": 0.15840278565883636, "step": 812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 1249.5833740234375, "completions/mean_terminated_length": 1249.5833740234375, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 1.2663551401869158, "grad_norm": 0.6452962160110474, "kl": 0.05812252499163151, "learning_rate": 1.49875e-06, "loss": -0.0111, "num_tokens": 106596608.0, "reward": 1.5400220155715942, "reward_std": 0.06712926179170609, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5400220155715942, "rewards/correct_reward_func/std": 0.16041254997253418, "step": 813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1315.6190185546875, "completions/mean_terminated_length": 1315.6190185546875, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 1.2679127725856698, "grad_norm": 0.6085140109062195, "kl": 0.055513957515358925, "learning_rate": 1.498125e-06, "loss": 0.0139, "num_tokens": 106712964.0, "reward": 1.5179225206375122, "reward_std": 0.06849826127290726, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5179225206375122, "rewards/correct_reward_func/std": 0.14082752168178558, "step": 814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2171.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 1369.9761962890625, "completions/mean_terminated_length": 1369.9761962890625, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 1.2694704049844237, "grad_norm": 0.6515775918960571, "kl": 0.05368403159081936, "learning_rate": 1.4975e-06, "loss": 0.0201, "num_tokens": 106833964.0, "reward": 1.4772425889968872, "reward_std": 0.058473389595746994, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47724252939224243, "rewards/correct_reward_func/std": 0.15556859970092773, "step": 815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1300.46435546875, "completions/mean_terminated_length": 1300.46435546875, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 1.2710280373831775, "grad_norm": 0.6149557828903198, "kl": 0.05302148498594761, "learning_rate": 1.496875e-06, "loss": 0.0146, "num_tokens": 106948987.0, "reward": 1.4856795072555542, "reward_std": 0.060432400554418564, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48567935824394226, "rewards/correct_reward_func/std": 0.12423360347747803, "step": 816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1733.0, "completions/mean_length": 1321.7381591796875, "completions/mean_terminated_length": 1238.9638671875, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "epoch": 1.2725856697819315, "grad_norm": 0.6380515098571777, "kl": 0.05285250209271908, "learning_rate": 1.49625e-06, "loss": 0.0367, "num_tokens": 107065911.0, "reward": 1.52854323387146, "reward_std": 0.07963281869888306, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.52854323387146, "rewards/correct_reward_func/std": 0.163214311003685, "step": 817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1988.0, "completions/max_terminated_length": 1988.0, "completions/mean_length": 1327.3214111328125, "completions/mean_terminated_length": 1327.3214111328125, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 1.2741433021806854, "grad_norm": 0.5899630188941956, "kl": 0.055711204186081886, "learning_rate": 1.495625e-06, "loss": 0.0106, "num_tokens": 107183286.0, "reward": 1.465398907661438, "reward_std": 0.08405665308237076, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47730350494384766, "rewards/correct_reward_func/std": 0.12836016714572906, "step": 818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1286.8809814453125, "completions/mean_terminated_length": 1286.8809814453125, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 1.2757009345794392, "grad_norm": 0.6295702457427979, "kl": 0.05397116579115391, "learning_rate": 1.495e-06, "loss": -0.0118, "num_tokens": 107297330.0, "reward": 1.5398021936416626, "reward_std": 0.06337767839431763, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5398020148277283, "rewards/correct_reward_func/std": 0.16977693140506744, "step": 819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1920.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1257.5833740234375, "completions/mean_terminated_length": 1257.5833740234375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 1.277258566978193, "grad_norm": 0.6716676950454712, "kl": 0.05769355781376362, "learning_rate": 1.494375e-06, "loss": 0.021, "num_tokens": 107408961.0, "reward": 1.5272583961486816, "reward_std": 0.07724601030349731, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5272584557533264, "rewards/correct_reward_func/std": 0.18930970132350922, "step": 820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1747.0, "completions/max_terminated_length": 1747.0, "completions/mean_length": 1267.5595703125, "completions/mean_terminated_length": 1267.5595703125, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.278816199376947, "grad_norm": 0.6954907774925232, "kl": 0.05500246211886406, "learning_rate": 1.4937499999999999e-06, "loss": 0.0035, "num_tokens": 107521244.0, "reward": 1.4823615550994873, "reward_std": 0.09682151675224304, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49426618218421936, "rewards/correct_reward_func/std": 0.12725263833999634, "step": 821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2168.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 1253.8214111328125, "completions/mean_terminated_length": 1253.8214111328125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 1.280373831775701, "grad_norm": 0.6106709241867065, "kl": 0.057063210755586624, "learning_rate": 1.493125e-06, "loss": -0.0043, "num_tokens": 107632637.0, "reward": 1.5474696159362793, "reward_std": 0.05892669036984444, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5474694967269897, "rewards/correct_reward_func/std": 0.18005988001823425, "step": 822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1297.5833740234375, "completions/mean_terminated_length": 1297.5833740234375, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 1.2819314641744548, "grad_norm": 0.6388299465179443, "kl": 0.05787203088402748, "learning_rate": 1.4925e-06, "loss": 0.0234, "num_tokens": 107747502.0, "reward": 1.4410191774368286, "reward_std": 0.08891406655311584, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4529239237308502, "rewards/correct_reward_func/std": 0.16428311169147491, "step": 823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1907.0, "completions/max_terminated_length": 1907.0, "completions/mean_length": 1179.7381591796875, "completions/mean_terminated_length": 1179.7381591796875, "completions/min_length": 439.0, "completions/min_terminated_length": 439.0, "epoch": 1.2834890965732086, "grad_norm": 0.6635016798973083, "kl": 0.057330962270498276, "learning_rate": 1.491875e-06, "loss": 0.0241, "num_tokens": 107852444.0, "reward": 1.5518784523010254, "reward_std": 0.08353918045759201, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5518783926963806, "rewards/correct_reward_func/std": 0.2062029391527176, "step": 824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 1347.357177734375, "completions/mean_terminated_length": 1264.8914794921875, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 1.2850467289719627, "grad_norm": 0.578759491443634, "kl": 0.0571975726634264, "learning_rate": 1.49125e-06, "loss": 0.0484, "num_tokens": 107971940.0, "reward": 1.4378589391708374, "reward_std": 0.0568789541721344, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43785884976387024, "rewards/correct_reward_func/std": 0.16381777822971344, "step": 825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 1215.1785888671875, "completions/mean_terminated_length": 1215.1785888671875, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 1.2866043613707165, "grad_norm": 0.6516785025596619, "kl": 0.0553536731749773, "learning_rate": 1.490625e-06, "loss": -0.0034, "num_tokens": 108080075.0, "reward": 1.6036394834518433, "reward_std": 0.07341770082712173, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.6036393046379089, "rewards/correct_reward_func/std": 0.19790509343147278, "step": 826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 1232.4881591796875, "completions/mean_terminated_length": 1232.4881591796875, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 1.2881619937694704, "grad_norm": 0.6247161030769348, "kl": 0.05350677669048309, "learning_rate": 1.49e-06, "loss": 0.0071, "num_tokens": 108189694.0, "reward": 1.6057114601135254, "reward_std": 0.08387381583452225, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.6176161766052246, "rewards/correct_reward_func/std": 0.17051324248313904, "step": 827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1795.0, "completions/max_terminated_length": 1795.0, "completions/mean_length": 1229.8095703125, "completions/mean_terminated_length": 1229.8095703125, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 1.2897196261682242, "grad_norm": 0.6240764260292053, "kl": 0.05720374546945095, "learning_rate": 1.4893749999999998e-06, "loss": 0.005, "num_tokens": 108298830.0, "reward": 1.4638899564743042, "reward_std": 0.06280633807182312, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46388983726501465, "rewards/correct_reward_func/std": 0.15485362708568573, "step": 828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1769.0, "completions/max_terminated_length": 1769.0, "completions/mean_length": 1241.0595703125, "completions/mean_terminated_length": 1241.0595703125, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "epoch": 1.291277258566978, "grad_norm": 0.6447198987007141, "kl": 0.0591514203697443, "learning_rate": 1.48875e-06, "loss": -0.0257, "num_tokens": 108409031.0, "reward": 1.48125422000885, "reward_std": 0.07750996947288513, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49315887689590454, "rewards/correct_reward_func/std": 0.11840316653251648, "step": 829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1859.0, "completions/max_terminated_length": 1859.0, "completions/mean_length": 1285.5238037109375, "completions/mean_terminated_length": 1285.5238037109375, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 1.2928348909657321, "grad_norm": 0.6621662378311157, "kl": 0.05617973953485489, "learning_rate": 1.4881249999999998e-06, "loss": -0.0091, "num_tokens": 108522991.0, "reward": 1.5105698108673096, "reward_std": 0.09365297853946686, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5105698108673096, "rewards/correct_reward_func/std": 0.152457132935524, "step": 830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1764.0, "completions/mean_length": 1278.5357666015625, "completions/mean_terminated_length": 1195.240966796875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 1.294392523364486, "grad_norm": 0.6176079511642456, "kl": 0.057270659133791924, "learning_rate": 1.4875e-06, "loss": 0.056, "num_tokens": 108636298.0, "reward": 1.4974220991134644, "reward_std": 0.1067122295498848, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5093267560005188, "rewards/correct_reward_func/std": 0.14272964000701904, "step": 831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1805.0, "completions/max_terminated_length": 1805.0, "completions/mean_length": 1236.9405517578125, "completions/mean_terminated_length": 1236.9405517578125, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "epoch": 1.2959501557632398, "grad_norm": 0.6839401721954346, "kl": 0.057625722140073776, "learning_rate": 1.4868749999999998e-06, "loss": 0.0086, "num_tokens": 108746207.0, "reward": 1.515440821647644, "reward_std": 0.06921108812093735, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.515440821647644, "rewards/correct_reward_func/std": 0.17127959430217743, "step": 832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1264.7381591796875, "completions/mean_terminated_length": 1264.7381591796875, "completions/min_length": 647.0, "completions/min_terminated_length": 647.0, "epoch": 1.2975077881619939, "grad_norm": 0.6479074954986572, "kl": 0.05918855965137482, "learning_rate": 1.48625e-06, "loss": -0.0112, "num_tokens": 108858493.0, "reward": 1.4901705980300903, "reward_std": 0.07281164079904556, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49017032980918884, "rewards/correct_reward_func/std": 0.12836848199367523, "step": 833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 1280.96435546875, "completions/mean_terminated_length": 1280.96435546875, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 1.2990654205607477, "grad_norm": 0.6369271278381348, "kl": 0.059232594445347786, "learning_rate": 1.4856249999999999e-06, "loss": -0.0057, "num_tokens": 108971920.0, "reward": 1.4865796566009521, "reward_std": 0.04098751023411751, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.486579567193985, "rewards/correct_reward_func/std": 0.15600472688674927, "step": 834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1817.0, "completions/max_terminated_length": 1817.0, "completions/mean_length": 1249.5, "completions/mean_terminated_length": 1249.5, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 1.3006230529595015, "grad_norm": 0.6544007658958435, "kl": 0.06083432957530022, "learning_rate": 1.485e-06, "loss": 0.0176, "num_tokens": 109082914.0, "reward": 1.4533076286315918, "reward_std": 0.04274992644786835, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.453307569026947, "rewards/correct_reward_func/std": 0.07037007808685303, "step": 835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1300.666748046875, "completions/mean_terminated_length": 1300.666748046875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 1.3021806853582554, "grad_norm": 0.7110933065414429, "kl": 0.058566370978951454, "learning_rate": 1.4843749999999999e-06, "loss": 0.0251, "num_tokens": 109198068.0, "reward": 1.5509032011032104, "reward_std": 0.07329439371824265, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5509031414985657, "rewards/correct_reward_func/std": 0.13326621055603027, "step": 836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1811.0, "completions/max_terminated_length": 1811.0, "completions/mean_length": 1270.547607421875, "completions/mean_terminated_length": 1270.547607421875, "completions/min_length": 505.0, "completions/min_terminated_length": 505.0, "epoch": 1.3037383177570092, "grad_norm": 0.6527374982833862, "kl": 0.05826532281935215, "learning_rate": 1.4837499999999998e-06, "loss": -0.0102, "num_tokens": 109310794.0, "reward": 1.4442270994186401, "reward_std": 0.07275613397359848, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44422703981399536, "rewards/correct_reward_func/std": 0.16298054158687592, "step": 837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1242.5, "completions/mean_terminated_length": 1242.5, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 1.3052959501557633, "grad_norm": 0.6311480402946472, "kl": 0.056207481771707535, "learning_rate": 1.4831249999999999e-06, "loss": 0.038, "num_tokens": 109421086.0, "reward": 1.5303537845611572, "reward_std": 0.09045805782079697, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5422585010528564, "rewards/correct_reward_func/std": 0.14229103922843933, "step": 838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1834.0, "completions/mean_length": 1387.40478515625, "completions/mean_terminated_length": 1305.421630859375, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 1.3068535825545171, "grad_norm": 0.6146191358566284, "kl": 0.05974440835416317, "learning_rate": 1.4824999999999998e-06, "loss": 0.0755, "num_tokens": 109543604.0, "reward": 1.4635041952133179, "reward_std": 0.05726633965969086, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4635040760040283, "rewards/correct_reward_func/std": 0.19411805272102356, "step": 839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1278.2381591796875, "completions/mean_terminated_length": 1278.2381591796875, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 1.308411214953271, "grad_norm": 0.6642119288444519, "kl": 0.05755990743637085, "learning_rate": 1.4818749999999999e-06, "loss": 0.0148, "num_tokens": 109656970.0, "reward": 1.5296529531478882, "reward_std": 0.06168491020798683, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5296528935432434, "rewards/correct_reward_func/std": 0.13547885417938232, "step": 840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 1326.0833740234375, "completions/mean_terminated_length": 1326.0833740234375, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 1.309968847352025, "grad_norm": 0.6326037645339966, "kl": 0.058037688955664635, "learning_rate": 1.4812499999999998e-06, "loss": 0.0139, "num_tokens": 109774409.0, "reward": 1.532925009727478, "reward_std": 0.07814683765172958, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.532925009727478, "rewards/correct_reward_func/std": 0.12932780385017395, "step": 841 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1351.0833740234375, "completions/mean_terminated_length": 1351.0833740234375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 1.3115264797507789, "grad_norm": 0.659521222114563, "kl": 0.05781222693622112, "learning_rate": 1.4806250000000001e-06, "loss": 0.0293, "num_tokens": 109893816.0, "reward": 1.50190269947052, "reward_std": 0.05357246845960617, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5019026398658752, "rewards/correct_reward_func/std": 0.10981904715299606, "step": 842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1890.0, "completions/max_terminated_length": 1890.0, "completions/mean_length": 1332.1309814453125, "completions/mean_terminated_length": 1332.1309814453125, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 1.3130841121495327, "grad_norm": 0.6595703363418579, "kl": 0.058373432606458664, "learning_rate": 1.48e-06, "loss": 0.0093, "num_tokens": 110011709.0, "reward": 1.4661359786987305, "reward_std": 0.0949120968580246, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4780406355857849, "rewards/correct_reward_func/std": 0.17147719860076904, "step": 843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1961.0, "completions/mean_length": 1432.0833740234375, "completions/mean_terminated_length": 1350.6385498046875, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 1.3146417445482865, "grad_norm": 0.5944724678993225, "kl": 0.05312122218310833, "learning_rate": 1.4793750000000001e-06, "loss": 0.0612, "num_tokens": 110138094.0, "reward": 1.4195321798324585, "reward_std": 0.08428820967674255, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43143680691719055, "rewards/correct_reward_func/std": 0.14636452496051788, "step": 844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 1291.2738037109375, "completions/mean_terminated_length": 1291.2738037109375, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 1.3161993769470404, "grad_norm": 0.6563733816146851, "kl": 0.05528195761144161, "learning_rate": 1.47875e-06, "loss": 0.0025, "num_tokens": 110252579.0, "reward": 1.5054223537445068, "reward_std": 0.07998127490282059, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5173270106315613, "rewards/correct_reward_func/std": 0.1488352119922638, "step": 845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 1312.8690185546875, "completions/mean_terminated_length": 1312.8690185546875, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "epoch": 1.3177570093457944, "grad_norm": 0.6194365620613098, "kl": 0.056114861741662025, "learning_rate": 1.478125e-06, "loss": -0.0027, "num_tokens": 110368476.0, "reward": 1.5306884050369263, "reward_std": 0.10831733047962189, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5425930023193359, "rewards/correct_reward_func/std": 0.2073424756526947, "step": 846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2197.0, "completions/max_terminated_length": 2197.0, "completions/mean_length": 1374.71435546875, "completions/mean_terminated_length": 1374.71435546875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 1.3193146417445483, "grad_norm": 0.6330597996711731, "kl": 0.05779997259378433, "learning_rate": 1.4775e-06, "loss": -0.0103, "num_tokens": 110489814.0, "reward": 1.5173296928405762, "reward_std": 0.0651247426867485, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5173295140266418, "rewards/correct_reward_func/std": 0.16119088232517242, "step": 847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1886.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 1317.9285888671875, "completions/mean_terminated_length": 1317.9285888671875, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 1.320872274143302, "grad_norm": 0.6775298714637756, "kl": 0.057511694729328156, "learning_rate": 1.476875e-06, "loss": -0.0073, "num_tokens": 110606496.0, "reward": 1.4734554290771484, "reward_std": 0.06864751875400543, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48536020517349243, "rewards/correct_reward_func/std": 0.12160845100879669, "step": 848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2728.0, "completions/max_terminated_length": 2728.0, "completions/mean_length": 1380.5714111328125, "completions/mean_terminated_length": 1380.5714111328125, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 1.3224299065420562, "grad_norm": 0.6578128933906555, "kl": 0.05723660998046398, "learning_rate": 1.47625e-06, "loss": 0.0149, "num_tokens": 110728416.0, "reward": 1.5106651782989502, "reward_std": 0.08196483552455902, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5225698351860046, "rewards/correct_reward_func/std": 0.13148510456085205, "step": 849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1335.011962890625, "completions/mean_terminated_length": 1335.011962890625, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 1.32398753894081, "grad_norm": 0.6308766603469849, "kl": 0.05739885754883289, "learning_rate": 1.475625e-06, "loss": -0.0057, "num_tokens": 110846599.0, "reward": 1.5422691106796265, "reward_std": 0.07354159653186798, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5422690510749817, "rewards/correct_reward_func/std": 0.1881309300661087, "step": 850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1929.0, "completions/max_terminated_length": 1929.0, "completions/mean_length": 1297.7261962890625, "completions/mean_terminated_length": 1297.7261962890625, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 1.3255451713395638, "grad_norm": 0.6275732517242432, "kl": 0.05609885975718498, "learning_rate": 1.475e-06, "loss": 0.0015, "num_tokens": 110961572.0, "reward": 1.4340226650238037, "reward_std": 0.06935743987560272, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43402254581451416, "rewards/correct_reward_func/std": 0.14177773892879486, "step": 851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3288.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 1362.3690185546875, "completions/mean_terminated_length": 1362.3690185546875, "completions/min_length": 386.0, "completions/min_terminated_length": 386.0, "epoch": 1.3271028037383177, "grad_norm": 0.6543543338775635, "kl": 0.05645536072552204, "learning_rate": 1.474375e-06, "loss": -0.0023, "num_tokens": 111082083.0, "reward": 1.4616303443908691, "reward_std": 0.058727361261844635, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46163028478622437, "rewards/correct_reward_func/std": 0.1665804088115692, "step": 852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2182.0, "completions/max_terminated_length": 2182.0, "completions/mean_length": 1356.6309814453125, "completions/mean_terminated_length": 1356.6309814453125, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 1.3286604361370715, "grad_norm": 0.6333216428756714, "kl": 0.05571644380688667, "learning_rate": 1.4737499999999999e-06, "loss": -0.0176, "num_tokens": 111201842.0, "reward": 1.50204336643219, "reward_std": 0.1097206398844719, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5139480233192444, "rewards/correct_reward_func/std": 0.14730492234230042, "step": 853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1841.0, "completions/max_terminated_length": 1841.0, "completions/mean_length": 1305.357177734375, "completions/mean_terminated_length": 1305.357177734375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "epoch": 1.3302180685358256, "grad_norm": 0.609614908695221, "kl": 0.05513115040957928, "learning_rate": 1.473125e-06, "loss": -0.0089, "num_tokens": 111317348.0, "reward": 1.4387062788009644, "reward_std": 0.07816533744335175, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4506109356880188, "rewards/correct_reward_func/std": 0.15743528306484222, "step": 854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1395.8095703125, "completions/mean_terminated_length": 1395.8095703125, "completions/min_length": 445.0, "completions/min_terminated_length": 445.0, "epoch": 1.3317757009345794, "grad_norm": 0.6440364122390747, "kl": 0.05603250488638878, "learning_rate": 1.4724999999999999e-06, "loss": 0.0074, "num_tokens": 111440536.0, "reward": 1.5060646533966064, "reward_std": 0.08835386484861374, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5179694294929504, "rewards/correct_reward_func/std": 0.16371630132198334, "step": 855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 1438.797607421875, "completions/mean_terminated_length": 1357.4337158203125, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 1.3333333333333333, "grad_norm": 0.6040071845054626, "kl": 0.053471729159355164, "learning_rate": 1.471875e-06, "loss": 0.0891, "num_tokens": 111567347.0, "reward": 1.4874982833862305, "reward_std": 0.06487350910902023, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48749813437461853, "rewards/correct_reward_func/std": 0.13729378581047058, "step": 856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1822.0, "completions/max_terminated_length": 1822.0, "completions/mean_length": 1341.666748046875, "completions/mean_terminated_length": 1341.666748046875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 1.3348909657320873, "grad_norm": 0.6194965839385986, "kl": 0.05670303851366043, "learning_rate": 1.4712499999999999e-06, "loss": 0.0087, "num_tokens": 111685999.0, "reward": 1.4822922945022583, "reward_std": 0.08171102404594421, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49419695138931274, "rewards/correct_reward_func/std": 0.1873791217803955, "step": 857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 1427.7857666015625, "completions/mean_terminated_length": 1427.7857666015625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 1.3364485981308412, "grad_norm": 0.5877569317817688, "kl": 0.057048603892326355, "learning_rate": 1.470625e-06, "loss": 0.0054, "num_tokens": 111811975.0, "reward": 1.4544936418533325, "reward_std": 0.08619140088558197, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4663982391357422, "rewards/correct_reward_func/std": 0.132582426071167, "step": 858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 1622.6905517578125, "completions/mean_terminated_length": 1462.46337890625, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 1.338006230529595, "grad_norm": 0.5447506904602051, "kl": 0.05207224562764168, "learning_rate": 1.47e-06, "loss": 0.0919, "num_tokens": 111954347.0, "reward": 1.4136794805526733, "reward_std": 0.06224596127867699, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41367945075035095, "rewards/correct_reward_func/std": 0.10898096859455109, "step": 859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1358.952392578125, "completions/mean_terminated_length": 1358.952392578125, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 1.3395638629283488, "grad_norm": 0.6685717105865479, "kl": 0.05506133660674095, "learning_rate": 1.469375e-06, "loss": -0.0011, "num_tokens": 112074397.0, "reward": 1.5232875347137451, "reward_std": 0.06414449214935303, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5232874751091003, "rewards/correct_reward_func/std": 0.1500086933374405, "step": 860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1401.857177734375, "completions/mean_terminated_length": 1401.857177734375, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 1.3411214953271027, "grad_norm": 0.6013727784156799, "kl": 0.057697853073477745, "learning_rate": 1.46875e-06, "loss": 0.0113, "num_tokens": 112198117.0, "reward": 1.4316805601119995, "reward_std": 0.11122133582830429, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337099134922028, "rewards/correct_reward_func/mean": 0.45549002289772034, "rewards/correct_reward_func/std": 0.10769819468259811, "step": 861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2133.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 1401.2381591796875, "completions/mean_terminated_length": 1401.2381591796875, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 1.3426791277258567, "grad_norm": 0.6382337808609009, "kl": 0.05703101493418217, "learning_rate": 1.4681249999999998e-06, "loss": 0.0161, "num_tokens": 112321749.0, "reward": 1.479724407196045, "reward_std": 0.08678054809570312, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4916289150714874, "rewards/correct_reward_func/std": 0.18822021782398224, "step": 862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 1404.15478515625, "completions/mean_terminated_length": 1404.15478515625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 1.3442367601246106, "grad_norm": 0.6245453357696533, "kl": 0.05679042264819145, "learning_rate": 1.4675e-06, "loss": -0.0176, "num_tokens": 112445644.0, "reward": 1.47727632522583, "reward_std": 0.0804123654961586, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4891809821128845, "rewards/correct_reward_func/std": 0.14304745197296143, "step": 863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2276.0, "completions/max_terminated_length": 2276.0, "completions/mean_length": 1472.8929443359375, "completions/mean_terminated_length": 1472.8929443359375, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 1.3457943925233644, "grad_norm": 0.600166380405426, "kl": 0.054330646991729736, "learning_rate": 1.4668749999999998e-06, "loss": -0.0012, "num_tokens": 112575379.0, "reward": 1.4825719594955444, "reward_std": 0.07509178668260574, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49447664618492126, "rewards/correct_reward_func/std": 0.13370554149150848, "step": 864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4206.0, "completions/max_terminated_length": 4206.0, "completions/mean_length": 1449.5833740234375, "completions/mean_terminated_length": 1449.5833740234375, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "epoch": 1.3473520249221185, "grad_norm": 0.6423432230949402, "kl": 0.05620681308209896, "learning_rate": 1.46625e-06, "loss": 0.0338, "num_tokens": 112703228.0, "reward": 1.5245707035064697, "reward_std": 0.07068542391061783, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5245704650878906, "rewards/correct_reward_func/std": 0.13473057746887207, "step": 865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1876.0, "completions/max_terminated_length": 1876.0, "completions/mean_length": 1393.261962890625, "completions/mean_terminated_length": 1393.261962890625, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 1.3489096573208723, "grad_norm": 0.6431348323822021, "kl": 0.05682270601391792, "learning_rate": 1.4656249999999998e-06, "loss": 0.005, "num_tokens": 112826262.0, "reward": 1.4459993839263916, "reward_std": 0.08421668410301208, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4579041004180908, "rewards/correct_reward_func/std": 0.12991267442703247, "step": 866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 1389.3214111328125, "completions/mean_terminated_length": 1389.3214111328125, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.3504672897196262, "grad_norm": 0.6164113879203796, "kl": 0.05660521984100342, "learning_rate": 1.465e-06, "loss": -0.0031, "num_tokens": 112948947.0, "reward": 1.522879719734192, "reward_std": 0.06196318566799164, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5228796601295471, "rewards/correct_reward_func/std": 0.13261692225933075, "step": 867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 1468.011962890625, "completions/mean_terminated_length": 1468.011962890625, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 1.35202492211838, "grad_norm": 0.6326475739479065, "kl": 0.06204744055867195, "learning_rate": 1.4643749999999998e-06, "loss": -0.0284, "num_tokens": 113078236.0, "reward": 1.5423513650894165, "reward_std": 0.06403238326311111, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5423513054847717, "rewards/correct_reward_func/std": 0.13513179123401642, "step": 868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1371.857177734375, "completions/mean_terminated_length": 1371.857177734375, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 1.3535825545171338, "grad_norm": 0.7463195323944092, "kl": 0.05850627273321152, "learning_rate": 1.46375e-06, "loss": 0.0226, "num_tokens": 113199424.0, "reward": 1.5003184080123901, "reward_std": 0.07682616263628006, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.512222945690155, "rewards/correct_reward_func/std": 0.16638441383838654, "step": 869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1451.2381591796875, "completions/mean_terminated_length": 1451.2381591796875, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 1.355140186915888, "grad_norm": 0.6129419803619385, "kl": 0.057191383093595505, "learning_rate": 1.4631249999999999e-06, "loss": -0.0131, "num_tokens": 113327178.0, "reward": 1.4611753225326538, "reward_std": 0.053040098398923874, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46117523312568665, "rewards/correct_reward_func/std": 0.09122917056083679, "step": 870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1417.3214111328125, "completions/mean_terminated_length": 1417.3214111328125, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 1.3566978193146417, "grad_norm": 0.5949195623397827, "kl": 0.05857289209961891, "learning_rate": 1.4624999999999998e-06, "loss": -0.0059, "num_tokens": 113452233.0, "reward": 1.5169587135314941, "reward_std": 0.09305848926305771, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5288633704185486, "rewards/correct_reward_func/std": 0.1506938487291336, "step": 871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 1465.6190185546875, "completions/mean_terminated_length": 1465.6190185546875, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 1.3582554517133956, "grad_norm": 0.589340329170227, "kl": 0.055778129026293755, "learning_rate": 1.4618749999999999e-06, "loss": 0.006, "num_tokens": 113581249.0, "reward": 1.4750484228134155, "reward_std": 0.06620852649211884, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47504857182502747, "rewards/correct_reward_func/std": 0.14469914138317108, "step": 872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1466.3214111328125, "completions/mean_terminated_length": 1466.3214111328125, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 1.3598130841121496, "grad_norm": 0.6115531921386719, "kl": 0.058202724903821945, "learning_rate": 1.4612499999999998e-06, "loss": -0.0216, "num_tokens": 113710480.0, "reward": 1.5128999948501587, "reward_std": 0.06866886466741562, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5128997564315796, "rewards/correct_reward_func/std": 0.15225443243980408, "step": 873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 1332.5, "completions/mean_terminated_length": 1332.5, "completions/min_length": 514.0, "completions/min_terminated_length": 514.0, "epoch": 1.3613707165109035, "grad_norm": 0.6277832388877869, "kl": 0.05491664633154869, "learning_rate": 1.460625e-06, "loss": -0.0155, "num_tokens": 113828302.0, "reward": 1.5254782438278198, "reward_std": 0.07950825989246368, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5373828411102295, "rewards/correct_reward_func/std": 0.17209453880786896, "step": 874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 1458.797607421875, "completions/mean_terminated_length": 1458.797607421875, "completions/min_length": 388.0, "completions/min_terminated_length": 388.0, "epoch": 1.3629283489096573, "grad_norm": 0.5984455943107605, "kl": 0.05571623332798481, "learning_rate": 1.46e-06, "loss": 0.0094, "num_tokens": 113957063.0, "reward": 1.4808282852172852, "reward_std": 0.05077481269836426, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.480828195810318, "rewards/correct_reward_func/std": 0.0890769213438034, "step": 875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 1520.8095703125, "completions/mean_terminated_length": 1520.8095703125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 1.3644859813084111, "grad_norm": 0.5540310144424438, "kl": 0.0549551360309124, "learning_rate": 1.4593750000000001e-06, "loss": -0.0332, "num_tokens": 114090757.0, "reward": 1.6168055534362793, "reward_std": 0.07043300569057465, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.616805374622345, "rewards/correct_reward_func/std": 0.15268778800964355, "step": 876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2330.0, "completions/max_terminated_length": 2330.0, "completions/mean_length": 1447.011962890625, "completions/mean_terminated_length": 1447.011962890625, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "epoch": 1.366043613707165, "grad_norm": 0.6382185816764832, "kl": 0.05652775056660175, "learning_rate": 1.45875e-06, "loss": -0.0207, "num_tokens": 114218492.0, "reward": 1.5179688930511475, "reward_std": 0.05837404355406761, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5179687738418579, "rewards/correct_reward_func/std": 0.15670737624168396, "step": 877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1398.6905517578125, "completions/mean_terminated_length": 1398.6905517578125, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 1.367601246105919, "grad_norm": 0.6205227971076965, "kl": 0.05711596459150314, "learning_rate": 1.458125e-06, "loss": 0.015, "num_tokens": 114341958.0, "reward": 1.4727941751480103, "reward_std": 0.08140215277671814, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48469892144203186, "rewards/correct_reward_func/std": 0.15082986652851105, "step": 878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2216.0, "completions/max_terminated_length": 2216.0, "completions/mean_length": 1415.547607421875, "completions/mean_terminated_length": 1415.547607421875, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 1.3691588785046729, "grad_norm": 0.654856264591217, "kl": 0.06367814727127552, "learning_rate": 1.4575e-06, "loss": -0.0018, "num_tokens": 114466792.0, "reward": 1.5041035413742065, "reward_std": 0.0731503814458847, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5041035413742065, "rewards/correct_reward_func/std": 0.16042949259281158, "step": 879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 1604.8809814453125, "completions/mean_terminated_length": 1525.51806640625, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.3707165109034267, "grad_norm": 0.5901898145675659, "kl": 0.053472839295864105, "learning_rate": 1.456875e-06, "loss": 0.1171, "num_tokens": 114607632.0, "reward": 1.4930131435394287, "reward_std": 0.061025217175483704, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49301305413246155, "rewards/correct_reward_func/std": 0.17567700147628784, "step": 880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 1483.047607421875, "completions/mean_terminated_length": 1483.047607421875, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 1.3722741433021808, "grad_norm": 0.6013805866241455, "kl": 0.05910382606089115, "learning_rate": 1.45625e-06, "loss": 0.005, "num_tokens": 114738118.0, "reward": 1.4590173959732056, "reward_std": 0.0662660300731659, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4590173363685608, "rewards/correct_reward_func/std": 0.13709169626235962, "step": 881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1531.6190185546875, "completions/mean_terminated_length": 1451.3734130859375, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 1.3738317757009346, "grad_norm": 0.5727903842926025, "kl": 0.0533680971711874, "learning_rate": 1.455625e-06, "loss": 0.0665, "num_tokens": 114872846.0, "reward": 1.4882384538650513, "reward_std": 0.07433220744132996, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48823848366737366, "rewards/correct_reward_func/std": 0.1290288120508194, "step": 882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2529.0, "completions/max_terminated_length": 2529.0, "completions/mean_length": 1426.4405517578125, "completions/mean_terminated_length": 1426.4405517578125, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 1.3753894080996885, "grad_norm": 0.6256980299949646, "kl": 0.055480191484093666, "learning_rate": 1.455e-06, "loss": 0.0078, "num_tokens": 114998487.0, "reward": 1.432407259941101, "reward_std": 0.07890105247497559, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44431182742118835, "rewards/correct_reward_func/std": 0.1913682222366333, "step": 883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1378.5238037109375, "completions/mean_terminated_length": 1378.5238037109375, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 1.3769470404984423, "grad_norm": 0.620612621307373, "kl": 0.05804718658328056, "learning_rate": 1.454375e-06, "loss": 0.0077, "num_tokens": 115120097.0, "reward": 1.4862065315246582, "reward_std": 0.09935810416936874, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5100159645080566, "rewards/correct_reward_func/std": 0.17950013279914856, "step": 884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1424.6785888671875, "completions/mean_terminated_length": 1424.6785888671875, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 1.3785046728971961, "grad_norm": 0.6544032692909241, "kl": 0.05744820274412632, "learning_rate": 1.45375e-06, "loss": -0.0128, "num_tokens": 115245758.0, "reward": 1.4994765520095825, "reward_std": 0.0722532644867897, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4994765818119049, "rewards/correct_reward_func/std": 0.15166634321212769, "step": 885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1410.6309814453125, "completions/mean_terminated_length": 1410.6309814453125, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 1.3800623052959502, "grad_norm": 0.623815655708313, "kl": 0.05730650760233402, "learning_rate": 1.453125e-06, "loss": -0.0151, "num_tokens": 115370275.0, "reward": 1.4487056732177734, "reward_std": 0.0821453407406807, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46061044931411743, "rewards/correct_reward_func/std": 0.11904603987932205, "step": 886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1495.84521484375, "completions/mean_terminated_length": 1495.84521484375, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 1.381619937694704, "grad_norm": 0.5747067928314209, "kl": 0.054390305653214455, "learning_rate": 1.4524999999999999e-06, "loss": 0.0183, "num_tokens": 115501944.0, "reward": 1.41459321975708, "reward_std": 0.05369851738214493, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4145931601524353, "rewards/correct_reward_func/std": 0.12552371621131897, "step": 887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 1451.1429443359375, "completions/mean_terminated_length": 1451.1429443359375, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 1.3831775700934579, "grad_norm": 0.6081941723823547, "kl": 0.05580301955342293, "learning_rate": 1.451875e-06, "loss": 0.0039, "num_tokens": 115629768.0, "reward": 1.484035849571228, "reward_std": 0.08495806902647018, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4959404468536377, "rewards/correct_reward_func/std": 0.11834685504436493, "step": 888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1355.6309814453125, "completions/mean_terminated_length": 1355.6309814453125, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 1.384735202492212, "grad_norm": 0.6069492697715759, "kl": 0.05641558952629566, "learning_rate": 1.4512499999999999e-06, "loss": -0.001, "num_tokens": 115749629.0, "reward": 1.490574836730957, "reward_std": 0.11412316560745239, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5143842101097107, "rewards/correct_reward_func/std": 0.16944913566112518, "step": 889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 1427.166748046875, "completions/mean_terminated_length": 1427.166748046875, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 1.3862928348909658, "grad_norm": 0.5850852727890015, "kl": 0.05718538165092468, "learning_rate": 1.450625e-06, "loss": 0.0275, "num_tokens": 115875469.0, "reward": 1.4617379903793335, "reward_std": 0.05744781345129013, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4617379605770111, "rewards/correct_reward_func/std": 0.16021159291267395, "step": 890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 5390.0, "completions/mean_length": 1574.702392578125, "completions/mean_terminated_length": 1494.975830078125, "completions/min_length": 674.0, "completions/min_terminated_length": 674.0, "epoch": 1.3878504672897196, "grad_norm": 0.5539150238037109, "kl": 0.05292163975536823, "learning_rate": 1.4499999999999999e-06, "loss": 0.0007, "num_tokens": 116013858.0, "reward": 1.4686534404754639, "reward_std": 0.07236435264348984, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46865344047546387, "rewards/correct_reward_func/std": 0.1547231674194336, "step": 891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 1426.0595703125, "completions/mean_terminated_length": 1426.0595703125, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 1.3894080996884735, "grad_norm": 0.602260947227478, "kl": 0.05691239610314369, "learning_rate": 1.449375e-06, "loss": 0.0264, "num_tokens": 116139581.0, "reward": 1.5643341541290283, "reward_std": 0.07589580118656158, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5762388110160828, "rewards/correct_reward_func/std": 0.1481904685497284, "step": 892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1520.46435546875, "completions/mean_terminated_length": 1440.084228515625, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "epoch": 1.3909657320872273, "grad_norm": 0.5521963238716125, "kl": 0.05354543216526508, "learning_rate": 1.4487499999999999e-06, "loss": 0.0503, "num_tokens": 116273258.0, "reward": 1.5151633024215698, "reward_std": 0.07917681336402893, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5151633024215698, "rewards/correct_reward_func/std": 0.18867127597332, "step": 893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1908.0, "completions/max_terminated_length": 1908.0, "completions/mean_length": 1375.4405517578125, "completions/mean_terminated_length": 1375.4405517578125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 1.3925233644859814, "grad_norm": 0.6409580707550049, "kl": 0.05660186521708965, "learning_rate": 1.448125e-06, "loss": 0.0337, "num_tokens": 116394747.0, "reward": 1.4822630882263184, "reward_std": 0.11477495729923248, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5060726404190063, "rewards/correct_reward_func/std": 0.16251885890960693, "step": 894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 1406.8214111328125, "completions/mean_terminated_length": 1406.8214111328125, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 1.3940809968847352, "grad_norm": 0.621264636516571, "kl": 0.0567406564950943, "learning_rate": 1.4475e-06, "loss": -0.0128, "num_tokens": 116519028.0, "reward": 1.526440978050232, "reward_std": 0.05866561830043793, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5264409184455872, "rewards/correct_reward_func/std": 0.16541090607643127, "step": 895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 1403.797607421875, "completions/mean_terminated_length": 1403.797607421875, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 1.395638629283489, "grad_norm": 0.5699082016944885, "kl": 0.05645635910332203, "learning_rate": 1.4468749999999998e-06, "loss": -0.0042, "num_tokens": 116643007.0, "reward": 1.4814952611923218, "reward_std": 0.09797579795122147, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49340003728866577, "rewards/correct_reward_func/std": 0.15578578412532806, "step": 896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1974.0, "completions/max_terminated_length": 1974.0, "completions/mean_length": 1432.7381591796875, "completions/mean_terminated_length": 1432.7381591796875, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 1.397196261682243, "grad_norm": 0.5962051153182983, "kl": 0.058391373604536057, "learning_rate": 1.44625e-06, "loss": -0.0116, "num_tokens": 116769363.0, "reward": 1.4879692792892456, "reward_std": 0.06595655530691147, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4879692494869232, "rewards/correct_reward_func/std": 0.1603183150291443, "step": 897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 1414.7381591796875, "completions/mean_terminated_length": 1414.7381591796875, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "epoch": 1.398753894080997, "grad_norm": 0.6030511856079102, "kl": 0.05669163726270199, "learning_rate": 1.4456249999999998e-06, "loss": 0.0005, "num_tokens": 116894093.0, "reward": 1.5083913803100586, "reward_std": 0.06333743780851364, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.508391261100769, "rewards/correct_reward_func/std": 0.14766350388526917, "step": 898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 1426.547607421875, "completions/mean_terminated_length": 1426.547607421875, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 1.4003115264797508, "grad_norm": 0.6138181090354919, "kl": 0.05802357941865921, "learning_rate": 1.445e-06, "loss": -0.0105, "num_tokens": 117019887.0, "reward": 1.5139949321746826, "reward_std": 0.06553500145673752, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5139948129653931, "rewards/correct_reward_func/std": 0.1458451747894287, "step": 899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2804.0, "completions/max_terminated_length": 2804.0, "completions/mean_length": 1399.797607421875, "completions/mean_terminated_length": 1399.797607421875, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 1.4018691588785046, "grad_norm": 0.5804871916770935, "kl": 0.056105878204107285, "learning_rate": 1.4443749999999998e-06, "loss": -0.02, "num_tokens": 117143236.0, "reward": 1.4730557203292847, "reward_std": 0.08534087985754013, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4849604666233063, "rewards/correct_reward_func/std": 0.17888279259204865, "step": 900 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 1447.8095703125, "completions/mean_terminated_length": 1447.8095703125, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 1.4034267912772584, "grad_norm": 0.594947338104248, "kl": 0.05691775679588318, "learning_rate": 1.44375e-06, "loss": -0.0176, "num_tokens": 117270804.0, "reward": 1.5226647853851318, "reward_std": 0.05812466889619827, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5226647257804871, "rewards/correct_reward_func/std": 0.16493678092956543, "step": 901 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2055.0, "completions/max_terminated_length": 2055.0, "completions/mean_length": 1454.84521484375, "completions/mean_terminated_length": 1454.84521484375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 1.4049844236760125, "grad_norm": 0.6105215549468994, "kl": 0.055627038702368736, "learning_rate": 1.4431249999999998e-06, "loss": 0.0234, "num_tokens": 117398993.0, "reward": 1.511657953262329, "reward_std": 0.07968111336231232, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5235626101493835, "rewards/correct_reward_func/std": 0.19582140445709229, "step": 902 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1449.2738037109375, "completions/mean_terminated_length": 1449.2738037109375, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 1.4065420560747663, "grad_norm": 0.6099538803100586, "kl": 0.05739925429224968, "learning_rate": 1.4424999999999997e-06, "loss": 0.0104, "num_tokens": 117526546.0, "reward": 1.4571181535720825, "reward_std": 0.05378911644220352, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45711803436279297, "rewards/correct_reward_func/std": 0.17163367569446564, "step": 903 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1423.46435546875, "completions/mean_terminated_length": 1423.46435546875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 1.4080996884735202, "grad_norm": 0.6613585352897644, "kl": 0.05594133213162422, "learning_rate": 1.4418749999999999e-06, "loss": 0.0143, "num_tokens": 117651985.0, "reward": 1.5938717126846313, "reward_std": 0.09149576723575592, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.6057763695716858, "rewards/correct_reward_func/std": 0.17281414568424225, "step": 904 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 1427.8929443359375, "completions/mean_terminated_length": 1427.8929443359375, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 1.4096573208722742, "grad_norm": 0.599256157875061, "kl": 0.05670376680791378, "learning_rate": 1.4412499999999998e-06, "loss": -0.004, "num_tokens": 117777766.0, "reward": 1.4417251348495483, "reward_std": 0.05539141595363617, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44172510504722595, "rewards/correct_reward_func/std": 0.154262974858284, "step": 905 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 1446.9881591796875, "completions/mean_terminated_length": 1446.9881591796875, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 1.411214953271028, "grad_norm": 0.6012739539146423, "kl": 0.056648531928658485, "learning_rate": 1.440625e-06, "loss": -0.0113, "num_tokens": 117905475.0, "reward": 1.5076979398727417, "reward_std": 0.05255338177084923, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5076977610588074, "rewards/correct_reward_func/std": 0.09542722254991531, "step": 906 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2560.0, "completions/max_terminated_length": 2560.0, "completions/mean_length": 1487.6309814453125, "completions/mean_terminated_length": 1487.6309814453125, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 1.412772585669782, "grad_norm": 0.6180278062820435, "kl": 0.05427748151123524, "learning_rate": 1.44e-06, "loss": 0.008, "num_tokens": 118036496.0, "reward": 1.4639973640441895, "reward_std": 0.050627920776605606, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4639973044395447, "rewards/correct_reward_func/std": 0.12111470848321915, "step": 907 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 1465.1309814453125, "completions/mean_terminated_length": 1465.1309814453125, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 1.4143302180685358, "grad_norm": 0.6335111856460571, "kl": 0.05609702318906784, "learning_rate": 1.439375e-06, "loss": -0.0209, "num_tokens": 118165507.0, "reward": 1.506919026374817, "reward_std": 0.0581449456512928, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5069187879562378, "rewards/correct_reward_func/std": 0.13004226982593536, "step": 908 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 1502.166748046875, "completions/mean_terminated_length": 1502.166748046875, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "epoch": 1.4158878504672896, "grad_norm": 0.593975841999054, "kl": 0.05645776726305485, "learning_rate": 1.43875e-06, "loss": 0.0092, "num_tokens": 118297701.0, "reward": 1.514088749885559, "reward_std": 0.048670992255210876, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5140886902809143, "rewards/correct_reward_func/std": 0.14365530014038086, "step": 909 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1453.8333740234375, "completions/mean_terminated_length": 1453.8333740234375, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 1.4174454828660437, "grad_norm": 0.6036884784698486, "kl": 0.056601958349347115, "learning_rate": 1.438125e-06, "loss": -0.0204, "num_tokens": 118425571.0, "reward": 1.5547549724578857, "reward_std": 0.07500748336315155, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5547547936439514, "rewards/correct_reward_func/std": 0.18868432939052582, "step": 910 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 1575.3690185546875, "completions/mean_terminated_length": 1575.3690185546875, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 1.4190031152647975, "grad_norm": 0.5665627121925354, "kl": 0.05619748495519161, "learning_rate": 1.4375e-06, "loss": -0.0093, "num_tokens": 118564100.0, "reward": 1.5020114183425903, "reward_std": 0.05269896984100342, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5020114183425903, "rewards/correct_reward_func/std": 0.12081959843635559, "step": 911 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 1533.5357666015625, "completions/mean_terminated_length": 1533.5357666015625, "completions/min_length": 36.0, "completions/min_terminated_length": 36.0, "epoch": 1.4205607476635513, "grad_norm": 0.5786104202270508, "kl": 0.05535414442420006, "learning_rate": 1.436875e-06, "loss": -0.0255, "num_tokens": 118698941.0, "reward": 1.4640154838562012, "reward_std": 0.09987229853868484, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47592025995254517, "rewards/correct_reward_func/std": 0.14402800798416138, "step": 912 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1648.761962890625, "completions/mean_terminated_length": 1569.9276123046875, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 1.4221183800623054, "grad_norm": 0.5720868706703186, "kl": 0.05730193480849266, "learning_rate": 1.43625e-06, "loss": 0.0438, "num_tokens": 118843305.0, "reward": 1.4377235174179077, "reward_std": 0.06128935515880585, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4377233386039734, "rewards/correct_reward_func/std": 0.1441432535648346, "step": 913 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 1558.7261962890625, "completions/mean_terminated_length": 1558.7261962890625, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 1.4236760124610592, "grad_norm": 0.5719354152679443, "kl": 0.058466263115406036, "learning_rate": 1.435625e-06, "loss": -0.0014, "num_tokens": 118980376.0, "reward": 1.4291819334030151, "reward_std": 0.05418519675731659, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4291818141937256, "rewards/correct_reward_func/std": 0.15738485753536224, "step": 914 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1490.1309814453125, "completions/mean_terminated_length": 1490.1309814453125, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 1.425233644859813, "grad_norm": 0.5989186763763428, "kl": 0.058671485632658005, "learning_rate": 1.435e-06, "loss": 0.0112, "num_tokens": 119111343.0, "reward": 1.5451024770736694, "reward_std": 0.05530606210231781, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5451023578643799, "rewards/correct_reward_func/std": 0.17648662626743317, "step": 915 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 1545.2738037109375, "completions/mean_terminated_length": 1545.2738037109375, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 1.426791277258567, "grad_norm": 0.5661653876304626, "kl": 0.056517476215958595, "learning_rate": 1.434375e-06, "loss": 0.0031, "num_tokens": 119247140.0, "reward": 1.530543565750122, "reward_std": 0.0579085648059845, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5305435061454773, "rewards/correct_reward_func/std": 0.15737612545490265, "step": 916 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 1632.5595703125, "completions/mean_terminated_length": 1553.530029296875, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 1.4283489096573208, "grad_norm": 0.6139397025108337, "kl": 0.053893081843853, "learning_rate": 1.43375e-06, "loss": 0.0572, "num_tokens": 119390173.0, "reward": 1.4624112844467163, "reward_std": 0.07330044358968735, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.462411105632782, "rewards/correct_reward_func/std": 0.12606181204319, "step": 917 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 1509.857177734375, "completions/mean_terminated_length": 1509.857177734375, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 1.4299065420560748, "grad_norm": 0.5906069278717041, "kl": 0.057032231241464615, "learning_rate": 1.433125e-06, "loss": 0.0076, "num_tokens": 119522905.0, "reward": 1.5514806509017944, "reward_std": 0.10114779323339462, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5752902030944824, "rewards/correct_reward_func/std": 0.14105330407619476, "step": 918 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3559.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 1607.0357666015625, "completions/mean_terminated_length": 1607.0357666015625, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 1.4314641744548287, "grad_norm": 0.5491380095481873, "kl": 0.05519489012658596, "learning_rate": 1.4325e-06, "loss": -0.0233, "num_tokens": 119663956.0, "reward": 1.4916614294052124, "reward_std": 0.05572972074151039, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49166131019592285, "rewards/correct_reward_func/std": 0.13992977142333984, "step": 919 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 1600.90478515625, "completions/mean_terminated_length": 1600.90478515625, "completions/min_length": 925.0, "completions/min_terminated_length": 925.0, "epoch": 1.4330218068535825, "grad_norm": 0.6002023816108704, "kl": 0.05795757472515106, "learning_rate": 1.431875e-06, "loss": 0.0273, "num_tokens": 119804522.0, "reward": 1.5251212120056152, "reward_std": 0.12058699131011963, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5489307641983032, "rewards/correct_reward_func/std": 0.18103274703025818, "step": 920 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 1590.4761962890625, "completions/mean_terminated_length": 1590.4761962890625, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 1.4345794392523366, "grad_norm": 0.6025384664535522, "kl": 0.05592229776084423, "learning_rate": 1.4312499999999998e-06, "loss": 0.0242, "num_tokens": 119944182.0, "reward": 1.4718756675720215, "reward_std": 0.08662988990545273, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4837804138660431, "rewards/correct_reward_func/std": 0.1548565775156021, "step": 921 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 1562.107177734375, "completions/mean_terminated_length": 1562.107177734375, "completions/min_length": 495.0, "completions/min_terminated_length": 495.0, "epoch": 1.4361370716510904, "grad_norm": 0.5856416821479797, "kl": 0.05425183288753033, "learning_rate": 1.430625e-06, "loss": 0.0058, "num_tokens": 120081471.0, "reward": 1.5170137882232666, "reward_std": 0.05324134603142738, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.517013669013977, "rewards/correct_reward_func/std": 0.15991690754890442, "step": 922 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2468.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 1503.34521484375, "completions/mean_terminated_length": 1503.34521484375, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 1.4376947040498442, "grad_norm": 0.588824450969696, "kl": 0.054391030222177505, "learning_rate": 1.4299999999999999e-06, "loss": 0.0126, "num_tokens": 120213752.0, "reward": 1.4910145998001099, "reward_std": 0.08497285097837448, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5029193162918091, "rewards/correct_reward_func/std": 0.1430082470178604, "step": 923 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2786.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 1606.4405517578125, "completions/mean_terminated_length": 1606.4405517578125, "completions/min_length": 1057.0, "completions/min_terminated_length": 1057.0, "epoch": 1.439252336448598, "grad_norm": 0.5846059918403625, "kl": 0.054474322125315666, "learning_rate": 1.429375e-06, "loss": -0.0071, "num_tokens": 120354603.0, "reward": 1.4693561792373657, "reward_std": 0.09815020114183426, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48126089572906494, "rewards/correct_reward_func/std": 0.18008500337600708, "step": 924 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2571.0, "completions/max_terminated_length": 2571.0, "completions/mean_length": 1518.702392578125, "completions/mean_terminated_length": 1518.702392578125, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 1.440809968847352, "grad_norm": 0.5867513418197632, "kl": 0.05604218691587448, "learning_rate": 1.4287499999999999e-06, "loss": 0.0145, "num_tokens": 120487994.0, "reward": 1.5328971147537231, "reward_std": 0.1053798571228981, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5567064881324768, "rewards/correct_reward_func/std": 0.18832466006278992, "step": 925 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 1588.9761962890625, "completions/mean_terminated_length": 1588.9761962890625, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 1.442367601246106, "grad_norm": 0.5647308826446533, "kl": 0.05172069929540157, "learning_rate": 1.428125e-06, "loss": -0.0063, "num_tokens": 120627498.0, "reward": 1.4882067441940308, "reward_std": 0.11672695726156235, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5120161771774292, "rewards/correct_reward_func/std": 0.15020842850208282, "step": 926 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3236.0, "completions/max_terminated_length": 3236.0, "completions/mean_length": 1654.166748046875, "completions/mean_terminated_length": 1654.166748046875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 1.4439252336448598, "grad_norm": 0.5926505327224731, "kl": 0.053758930414915085, "learning_rate": 1.4274999999999999e-06, "loss": -0.04, "num_tokens": 120772460.0, "reward": 1.4601887464523315, "reward_std": 0.05557785928249359, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46018868684768677, "rewards/correct_reward_func/std": 0.13792595267295837, "step": 927 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 1570.511962890625, "completions/mean_terminated_length": 1490.73486328125, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 1.4454828660436136, "grad_norm": 0.575140118598938, "kl": 0.053668420761823654, "learning_rate": 1.4268749999999998e-06, "loss": 0.0534, "num_tokens": 120910263.0, "reward": 1.5464792251586914, "reward_std": 0.0644306093454361, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5464791655540466, "rewards/correct_reward_func/std": 0.1873585283756256, "step": 928 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 1485.0833740234375, "completions/mean_terminated_length": 1485.0833740234375, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 1.4470404984423677, "grad_norm": 0.59922194480896, "kl": 0.055644553154706955, "learning_rate": 1.42625e-06, "loss": 0.0191, "num_tokens": 121040902.0, "reward": 1.4886407852172852, "reward_std": 0.07724044471979141, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48864060640335083, "rewards/correct_reward_func/std": 0.13255321979522705, "step": 929 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2183.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 1533.5357666015625, "completions/mean_terminated_length": 1533.5357666015625, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 1.4485981308411215, "grad_norm": 0.6095460057258606, "kl": 0.05331596918404102, "learning_rate": 1.4256249999999998e-06, "loss": 0.0145, "num_tokens": 121175743.0, "reward": 1.4885571002960205, "reward_std": 0.09139028936624527, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.500461757183075, "rewards/correct_reward_func/std": 0.16977085173130035, "step": 930 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2338.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 1468.5714111328125, "completions/mean_terminated_length": 1468.5714111328125, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 1.4501557632398754, "grad_norm": 0.5887149572372437, "kl": 0.05215633846819401, "learning_rate": 1.425e-06, "loss": 0.0072, "num_tokens": 121305121.0, "reward": 1.4931074380874634, "reward_std": 0.0646156594157219, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49310728907585144, "rewards/correct_reward_func/std": 0.15888220071792603, "step": 931 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3675.0, "completions/max_terminated_length": 3675.0, "completions/mean_length": 1578.547607421875, "completions/mean_terminated_length": 1578.547607421875, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 1.4517133956386292, "grad_norm": 0.6000562906265259, "kl": 0.05195810832083225, "learning_rate": 1.4243749999999998e-06, "loss": -0.0047, "num_tokens": 121443641.0, "reward": 1.5342808961868286, "reward_std": 0.06413520872592926, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5342807173728943, "rewards/correct_reward_func/std": 0.19110903143882751, "step": 932 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 1526.107177734375, "completions/mean_terminated_length": 1526.107177734375, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "epoch": 1.453271028037383, "grad_norm": 0.5677267909049988, "kl": 0.05291260778903961, "learning_rate": 1.42375e-06, "loss": -0.0034, "num_tokens": 121577720.0, "reward": 1.5239123106002808, "reward_std": 0.09007156640291214, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5358170866966248, "rewards/correct_reward_func/std": 0.1590527594089508, "step": 933 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1435.9405517578125, "completions/mean_terminated_length": 1435.9405517578125, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 1.4548286604361371, "grad_norm": 0.6042124629020691, "kl": 0.05365913547575474, "learning_rate": 1.4231249999999998e-06, "loss": 0.0113, "num_tokens": 121704381.0, "reward": 1.555336356163025, "reward_std": 0.05410221219062805, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5553362965583801, "rewards/correct_reward_func/std": 0.10760639607906342, "step": 934 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 1534.047607421875, "completions/mean_terminated_length": 1534.047607421875, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 1.456386292834891, "grad_norm": 0.5685712695121765, "kl": 0.05559377372264862, "learning_rate": 1.4225e-06, "loss": 0.0253, "num_tokens": 121839211.0, "reward": 1.4795795679092407, "reward_std": 0.03726006671786308, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4795795679092407, "rewards/correct_reward_func/std": 0.18165497481822968, "step": 935 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 1438.797607421875, "completions/mean_terminated_length": 1438.797607421875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 1.4579439252336448, "grad_norm": 0.5869097709655762, "kl": 0.05177008546888828, "learning_rate": 1.4218749999999998e-06, "loss": -0.005, "num_tokens": 121965962.0, "reward": 1.511366605758667, "reward_std": 0.06467973440885544, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5113665461540222, "rewards/correct_reward_func/std": 0.18183663487434387, "step": 936 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2523.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 1514.84521484375, "completions/mean_terminated_length": 1514.84521484375, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 1.4595015576323989, "grad_norm": 0.5757589936256409, "kl": 0.051053477451205254, "learning_rate": 1.4212499999999997e-06, "loss": -0.0105, "num_tokens": 122099215.0, "reward": 1.4375081062316895, "reward_std": 0.05030220001935959, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4375079870223999, "rewards/correct_reward_func/std": 0.13149473071098328, "step": 937 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 1483.107177734375, "completions/mean_terminated_length": 1483.107177734375, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 1.4610591900311527, "grad_norm": 0.5888691544532776, "kl": 0.05225286819040775, "learning_rate": 1.420625e-06, "loss": -0.0105, "num_tokens": 122229796.0, "reward": 1.4999641180038452, "reward_std": 0.06860499083995819, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49996402859687805, "rewards/correct_reward_func/std": 0.16214340925216675, "step": 938 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 1482.452392578125, "completions/mean_terminated_length": 1482.452392578125, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 1.4626168224299065, "grad_norm": 0.6363764405250549, "kl": 0.0545808169990778, "learning_rate": 1.42e-06, "loss": 0.0186, "num_tokens": 122360232.0, "reward": 1.5112730264663696, "reward_std": 0.0804838314652443, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5231776237487793, "rewards/correct_reward_func/std": 0.18320703506469727, "step": 939 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 1502.0238037109375, "completions/mean_terminated_length": 1502.0238037109375, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 1.4641744548286604, "grad_norm": 0.5603182315826416, "kl": 0.05400681868195534, "learning_rate": 1.419375e-06, "loss": 0.0035, "num_tokens": 122492498.0, "reward": 1.4782599210739136, "reward_std": 0.07609423995018005, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.478259801864624, "rewards/correct_reward_func/std": 0.15181030333042145, "step": 940 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 1564.7261962890625, "completions/mean_terminated_length": 1484.8795166015625, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 1.4657320872274142, "grad_norm": 0.5541762709617615, "kl": 0.050283899530768394, "learning_rate": 1.41875e-06, "loss": 0.0585, "num_tokens": 122629887.0, "reward": 1.5310512781143188, "reward_std": 0.0820552334189415, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5310512781143188, "rewards/correct_reward_func/std": 0.17773263156414032, "step": 941 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2364.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 1451.8214111328125, "completions/mean_terminated_length": 1451.8214111328125, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 1.4672897196261683, "grad_norm": 0.5607936978340149, "kl": 0.05418185517191887, "learning_rate": 1.418125e-06, "loss": 0.0154, "num_tokens": 122757822.0, "reward": 1.4701182842254639, "reward_std": 0.05788838863372803, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47011828422546387, "rewards/correct_reward_func/std": 0.14018170535564423, "step": 942 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 1502.452392578125, "completions/mean_terminated_length": 1421.8553466796875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 1.4688473520249221, "grad_norm": 0.5665018558502197, "kl": 0.05028504319489002, "learning_rate": 1.4175e-06, "loss": 0.0614, "num_tokens": 122890034.0, "reward": 1.403843879699707, "reward_std": 0.10934764891862869, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4157486855983734, "rewards/correct_reward_func/std": 0.17341257631778717, "step": 943 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2874.0, "completions/max_terminated_length": 2874.0, "completions/mean_length": 1478.166748046875, "completions/mean_terminated_length": 1478.166748046875, "completions/min_length": 596.0, "completions/min_terminated_length": 596.0, "epoch": 1.470404984423676, "grad_norm": 0.6497564911842346, "kl": 0.053252846002578735, "learning_rate": 1.416875e-06, "loss": 0.0594, "num_tokens": 123020122.0, "reward": 1.5276238918304443, "reward_std": 0.11952278763055801, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5514333844184875, "rewards/correct_reward_func/std": 0.21704232692718506, "step": 944 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3287.0, "completions/max_terminated_length": 3287.0, "completions/mean_length": 1385.607177734375, "completions/mean_terminated_length": 1385.607177734375, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 1.47196261682243, "grad_norm": 0.5961357951164246, "kl": 0.051925595849752426, "learning_rate": 1.41625e-06, "loss": 0.0416, "num_tokens": 123142231.0, "reward": 1.4902321100234985, "reward_std": 0.06453128904104233, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49023205041885376, "rewards/correct_reward_func/std": 0.12289825081825256, "step": 945 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 1389.1785888671875, "completions/mean_terminated_length": 1389.1785888671875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 1.4735202492211839, "grad_norm": 0.6209313869476318, "kl": 0.05260413885116577, "learning_rate": 1.4156249999999999e-06, "loss": 0.0087, "num_tokens": 123264772.0, "reward": 1.4915246963500977, "reward_std": 0.05111170560121536, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4915246069431305, "rewards/correct_reward_func/std": 0.15616562962532043, "step": 946 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1447.5714111328125, "completions/mean_terminated_length": 1447.5714111328125, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 1.4750778816199377, "grad_norm": 0.5911281108856201, "kl": 0.05365084111690521, "learning_rate": 1.415e-06, "loss": 0.0091, "num_tokens": 123392350.0, "reward": 1.4764894247055054, "reward_std": 0.05792779102921486, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4764893054962158, "rewards/correct_reward_func/std": 0.12649469077587128, "step": 947 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1444.607177734375, "completions/mean_terminated_length": 1444.607177734375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.4766355140186915, "grad_norm": 0.5943292379379272, "kl": 0.05160903558135033, "learning_rate": 1.414375e-06, "loss": 0.0543, "num_tokens": 123519865.0, "reward": 1.4330207109451294, "reward_std": 0.05177720636129379, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4330206513404846, "rewards/correct_reward_func/std": 0.15861865878105164, "step": 948 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5153.0, "completions/max_terminated_length": 5153.0, "completions/mean_length": 1558.09521484375, "completions/mean_terminated_length": 1558.09521484375, "completions/min_length": 1072.0, "completions/min_terminated_length": 1072.0, "epoch": 1.4781931464174454, "grad_norm": 0.5562092661857605, "kl": 0.05123803950846195, "learning_rate": 1.41375e-06, "loss": 0.0114, "num_tokens": 123656865.0, "reward": 1.4860385656356812, "reward_std": 0.055783726274967194, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48603856563568115, "rewards/correct_reward_func/std": 0.15280146896839142, "step": 949 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 1441.9405517578125, "completions/mean_terminated_length": 1441.9405517578125, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 1.4797507788161994, "grad_norm": 0.5906851887702942, "kl": 0.05138607695698738, "learning_rate": 1.413125e-06, "loss": -0.005, "num_tokens": 123783850.0, "reward": 1.5210059881210327, "reward_std": 0.07296521216630936, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5210058093070984, "rewards/correct_reward_func/std": 0.1746196299791336, "step": 950 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3729.0, "completions/mean_length": 1601.5714111328125, "completions/mean_terminated_length": 1522.1685791015625, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 1.4813084112149533, "grad_norm": 0.5691021084785461, "kl": 0.04798812232911587, "learning_rate": 1.4125e-06, "loss": 0.0492, "num_tokens": 123924730.0, "reward": 1.4929431676864624, "reward_std": 0.08075027167797089, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4929429888725281, "rewards/correct_reward_func/std": 0.15799371898174286, "step": 951 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1801.0, "completions/max_terminated_length": 1801.0, "completions/mean_length": 1377.7738037109375, "completions/mean_terminated_length": 1377.7738037109375, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 1.482866043613707, "grad_norm": 0.6194444298744202, "kl": 0.05371929705142975, "learning_rate": 1.411875e-06, "loss": -0.0031, "num_tokens": 124046319.0, "reward": 1.4838625192642212, "reward_std": 0.053877513855695724, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4838624596595764, "rewards/correct_reward_func/std": 0.13156504929065704, "step": 952 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3592.0, "completions/max_terminated_length": 3592.0, "completions/mean_length": 1421.46435546875, "completions/mean_terminated_length": 1421.46435546875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 1.4844236760124612, "grad_norm": 0.670741856098175, "kl": 0.05497931316494942, "learning_rate": 1.4112499999999998e-06, "loss": 0.0176, "num_tokens": 124171848.0, "reward": 1.4986613988876343, "reward_std": 0.11223272234201431, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.510565996170044, "rewards/correct_reward_func/std": 0.1498403400182724, "step": 953 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 1407.511962890625, "completions/mean_terminated_length": 1407.511962890625, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "epoch": 1.485981308411215, "grad_norm": 0.5721685290336609, "kl": 0.05319854058325291, "learning_rate": 1.410625e-06, "loss": 0.0179, "num_tokens": 124296175.0, "reward": 1.499523401260376, "reward_std": 0.09599613398313522, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5114279985427856, "rewards/correct_reward_func/std": 0.13554862141609192, "step": 954 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1365.202392578125, "completions/mean_terminated_length": 1365.202392578125, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 1.4875389408099688, "grad_norm": 0.5892857909202576, "kl": 0.05478361062705517, "learning_rate": 1.4099999999999998e-06, "loss": -0.0123, "num_tokens": 124416690.0, "reward": 1.564483880996704, "reward_std": 0.0664282962679863, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5644837617874146, "rewards/correct_reward_func/std": 0.12818284332752228, "step": 955 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1416.5833740234375, "completions/mean_terminated_length": 1334.9517822265625, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 1.4890965732087227, "grad_norm": 0.5662592649459839, "kl": 0.05166749097406864, "learning_rate": 1.409375e-06, "loss": 0.0648, "num_tokens": 124541617.0, "reward": 1.5131169557571411, "reward_std": 0.0802609845995903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5131169557571411, "rewards/correct_reward_func/std": 0.20592641830444336, "step": 956 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1442.952392578125, "completions/mean_terminated_length": 1442.952392578125, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 1.4906542056074765, "grad_norm": 0.5630207657814026, "kl": 0.05279768630862236, "learning_rate": 1.4087499999999999e-06, "loss": -0.0013, "num_tokens": 124668777.0, "reward": 1.4913194179534912, "reward_std": 0.05650303512811661, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49131929874420166, "rewards/correct_reward_func/std": 0.10572747141122818, "step": 957 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 1531.7738037109375, "completions/mean_terminated_length": 1451.530029296875, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 1.4922118380062306, "grad_norm": 0.5696709156036377, "kl": 0.050582244992256165, "learning_rate": 1.408125e-06, "loss": 0.0556, "num_tokens": 124803416.0, "reward": 1.4914538860321045, "reward_std": 0.06085503101348877, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49145376682281494, "rewards/correct_reward_func/std": 0.16253675520420074, "step": 958 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3247.0, "completions/max_terminated_length": 3247.0, "completions/mean_length": 1425.96435546875, "completions/mean_terminated_length": 1425.96435546875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 1.4937694704049844, "grad_norm": 0.5808737874031067, "kl": 0.051582057029008865, "learning_rate": 1.4074999999999999e-06, "loss": -0.0052, "num_tokens": 124929101.0, "reward": 1.5808985233306885, "reward_std": 0.09814611077308655, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5928031206130981, "rewards/correct_reward_func/std": 0.21265116333961487, "step": 959 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1995.0, "completions/max_terminated_length": 1995.0, "completions/mean_length": 1405.0, "completions/mean_terminated_length": 1405.0, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.4953271028037383, "grad_norm": 0.5805957913398743, "kl": 0.053646741434931755, "learning_rate": 1.406875e-06, "loss": -0.0369, "num_tokens": 125053175.0, "reward": 1.4618068933486938, "reward_std": 0.08247793465852737, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4618068039417267, "rewards/correct_reward_func/std": 0.14815668761730194, "step": 960 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2611.0, "completions/max_terminated_length": 2611.0, "completions/mean_length": 1456.166748046875, "completions/mean_terminated_length": 1456.166748046875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 1.4968847352024923, "grad_norm": 0.5520665049552917, "kl": 0.05207379162311554, "learning_rate": 1.4062499999999999e-06, "loss": 0.0054, "num_tokens": 125181451.0, "reward": 1.564023494720459, "reward_std": 0.05855153501033783, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5640233755111694, "rewards/correct_reward_func/std": 0.11973454058170319, "step": 961 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1342.642822265625, "completions/mean_terminated_length": 1342.642822265625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 1.4984423676012462, "grad_norm": 0.5934251546859741, "kl": 0.053072670474648476, "learning_rate": 1.4056249999999998e-06, "loss": -0.0069, "num_tokens": 125300203.0, "reward": 1.5065466165542603, "reward_std": 0.08383716642856598, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5184512734413147, "rewards/correct_reward_func/std": 0.1316198706626892, "step": 962 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2003.0, "completions/max_terminated_length": 2003.0, "completions/mean_length": 1402.6905517578125, "completions/mean_terminated_length": 1402.6905517578125, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 1.5, "grad_norm": 0.6050531268119812, "kl": 0.05264459736645222, "learning_rate": 1.4049999999999999e-06, "loss": -0.0085, "num_tokens": 125424023.0, "reward": 1.5129127502441406, "reward_std": 0.0603819340467453, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5248174071311951, "rewards/correct_reward_func/std": 0.17035160958766937, "step": 963 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1468.5, "completions/mean_terminated_length": 1387.493896484375, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 1.5015576323987538, "grad_norm": 0.6383330821990967, "kl": 0.05227588675916195, "learning_rate": 1.4043749999999998e-06, "loss": 0.0731, "num_tokens": 125553347.0, "reward": 1.439063549041748, "reward_std": 0.06335154920816422, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43906348943710327, "rewards/correct_reward_func/std": 0.11656816303730011, "step": 964 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 1408.7381591796875, "completions/mean_terminated_length": 1408.7381591796875, "completions/min_length": 478.0, "completions/min_terminated_length": 478.0, "epoch": 1.5031152647975077, "grad_norm": 0.6321585178375244, "kl": 0.05359589867293835, "learning_rate": 1.40375e-06, "loss": 0.0356, "num_tokens": 125677627.0, "reward": 1.4856932163238525, "reward_std": 0.06343919783830643, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4856932461261749, "rewards/correct_reward_func/std": 0.17424112558364868, "step": 965 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2172.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 1406.65478515625, "completions/mean_terminated_length": 1406.65478515625, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 1.5046728971962615, "grad_norm": 0.5907049179077148, "kl": 0.05286192148923874, "learning_rate": 1.4031249999999998e-06, "loss": 0.0145, "num_tokens": 125801564.0, "reward": 1.5403074026107788, "reward_std": 0.06774850189685822, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.540307343006134, "rewards/correct_reward_func/std": 0.14287278056144714, "step": 966 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1370.3809814453125, "completions/mean_terminated_length": 1370.3809814453125, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 1.5062305295950156, "grad_norm": 0.5974894762039185, "kl": 0.05525593459606171, "learning_rate": 1.4025e-06, "loss": -0.0099, "num_tokens": 125922418.0, "reward": 1.4362608194351196, "reward_std": 0.09759613871574402, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.46007010340690613, "rewards/correct_reward_func/std": 0.12809668481349945, "step": 967 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 1489.952392578125, "completions/mean_terminated_length": 1489.952392578125, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 1.5077881619937694, "grad_norm": 0.5633293390274048, "kl": 0.0533794816583395, "learning_rate": 1.401875e-06, "loss": -0.0011, "num_tokens": 126053604.0, "reward": 1.5027353763580322, "reward_std": 0.0703050047159195, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.502735435962677, "rewards/correct_reward_func/std": 0.17239470779895782, "step": 968 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2882.0, "completions/max_terminated_length": 2882.0, "completions/mean_length": 1459.3690185546875, "completions/mean_terminated_length": 1459.3690185546875, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 1.5093457943925235, "grad_norm": 0.561694324016571, "kl": 0.05276419036090374, "learning_rate": 1.4012500000000001e-06, "loss": 0.0087, "num_tokens": 126182089.0, "reward": 1.5203028917312622, "reward_std": 0.046117544174194336, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5203028917312622, "rewards/correct_reward_func/std": 0.1272481381893158, "step": 969 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1415.1785888671875, "completions/mean_terminated_length": 1415.1785888671875, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 1.5109034267912773, "grad_norm": 0.5880964994430542, "kl": 0.05588230863213539, "learning_rate": 1.400625e-06, "loss": -0.0095, "num_tokens": 126306718.0, "reward": 1.5234330892562866, "reward_std": 0.07166480273008347, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5234330296516418, "rewards/correct_reward_func/std": 0.1392935812473297, "step": 970 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2642.0, "completions/max_terminated_length": 2642.0, "completions/mean_length": 1410.1309814453125, "completions/mean_terminated_length": 1410.1309814453125, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 1.5124610591900312, "grad_norm": 0.6043219566345215, "kl": 0.053293656557798386, "learning_rate": 1.4e-06, "loss": 0.0179, "num_tokens": 126431079.0, "reward": 1.4949477910995483, "reward_std": 0.05433054268360138, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49494776129722595, "rewards/correct_reward_func/std": 0.12832768261432648, "step": 971 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 1419.5595703125, "completions/mean_terminated_length": 1419.5595703125, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 1.514018691588785, "grad_norm": 0.5639768242835999, "kl": 0.054521141573786736, "learning_rate": 1.399375e-06, "loss": -0.0118, "num_tokens": 126556394.0, "reward": 1.5019606351852417, "reward_std": 0.05233834683895111, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5019605755805969, "rewards/correct_reward_func/std": 0.15723295509815216, "step": 972 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1480.5833740234375, "completions/mean_terminated_length": 1480.5833740234375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 1.5155763239875388, "grad_norm": 0.6093022227287292, "kl": 0.05375696159899235, "learning_rate": 1.39875e-06, "loss": -0.0032, "num_tokens": 126686907.0, "reward": 1.5576914548873901, "reward_std": 0.05558937042951584, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5576913952827454, "rewards/correct_reward_func/std": 0.13169939815998077, "step": 973 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1508.1785888671875, "completions/mean_terminated_length": 1508.1785888671875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "epoch": 1.5171339563862927, "grad_norm": 0.62723708152771, "kl": 0.05615209974348545, "learning_rate": 1.398125e-06, "loss": -0.016, "num_tokens": 126819438.0, "reward": 1.485601544380188, "reward_std": 0.057860955595970154, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48560142517089844, "rewards/correct_reward_func/std": 0.1382783055305481, "step": 974 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1415.5, "completions/mean_terminated_length": 1415.5, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 1.5186915887850467, "grad_norm": 0.6126437783241272, "kl": 0.05616501159965992, "learning_rate": 1.3975e-06, "loss": 0.0033, "num_tokens": 126944358.0, "reward": 1.5345946550369263, "reward_std": 0.05855342373251915, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5345944762229919, "rewards/correct_reward_func/std": 0.14352357387542725, "step": 975 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2133.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 1483.916748046875, "completions/mean_terminated_length": 1483.916748046875, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "epoch": 1.5202492211838006, "grad_norm": 0.582556426525116, "kl": 0.05510487221181393, "learning_rate": 1.396875e-06, "loss": -0.0272, "num_tokens": 127075007.0, "reward": 1.5114144086837769, "reward_std": 0.058944329619407654, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5114142298698425, "rewards/correct_reward_func/std": 0.16233842074871063, "step": 976 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 1534.1309814453125, "completions/mean_terminated_length": 1534.1309814453125, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "epoch": 1.5218068535825546, "grad_norm": 0.592313289642334, "kl": 0.056483207270503044, "learning_rate": 1.39625e-06, "loss": -0.0018, "num_tokens": 127209838.0, "reward": 1.5234310626983643, "reward_std": 0.09745682030916214, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5353357195854187, "rewards/correct_reward_func/std": 0.1777001917362213, "step": 977 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 1463.5238037109375, "completions/mean_terminated_length": 1463.5238037109375, "completions/min_length": 925.0, "completions/min_terminated_length": 925.0, "epoch": 1.5233644859813085, "grad_norm": 0.5606083869934082, "kl": 0.05754604563117027, "learning_rate": 1.3956249999999999e-06, "loss": -0.0177, "num_tokens": 127338654.0, "reward": 1.5304343700408936, "reward_std": 0.05394675210118294, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5304343104362488, "rewards/correct_reward_func/std": 0.1772596538066864, "step": 978 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2092.0, "completions/mean_length": 1560.46435546875, "completions/mean_terminated_length": 1480.566162109375, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 1.5249221183800623, "grad_norm": 0.5772314071655273, "kl": 0.05454133078455925, "learning_rate": 1.395e-06, "loss": -0.0027, "num_tokens": 127475607.0, "reward": 1.4887261390686035, "reward_std": 0.12839165329933167, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.5244404077529907, "rewards/correct_reward_func/std": 0.1708691418170929, "step": 979 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1508.666748046875, "completions/mean_terminated_length": 1508.666748046875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 1.5264797507788161, "grad_norm": 0.5475894212722778, "kl": 0.05505007319152355, "learning_rate": 1.3943749999999999e-06, "loss": 0.0, "num_tokens": 127608431.0, "reward": 1.4789574146270752, "reward_std": 0.061023686081171036, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4789574146270752, "rewards/correct_reward_func/std": 0.10683758556842804, "step": 980 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 1466.047607421875, "completions/mean_terminated_length": 1466.047607421875, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 1.52803738317757, "grad_norm": 0.593788206577301, "kl": 0.055405523627996445, "learning_rate": 1.39375e-06, "loss": 0.0461, "num_tokens": 127737561.0, "reward": 1.4527305364608765, "reward_std": 0.06627821177244186, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46463513374328613, "rewards/correct_reward_func/std": 0.10521090030670166, "step": 981 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 1594.4881591796875, "completions/mean_terminated_length": 1515.0, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 1.5295950155763238, "grad_norm": 0.5138880014419556, "kl": 0.053472984582185745, "learning_rate": 1.393125e-06, "loss": 0.0527, "num_tokens": 127877624.0, "reward": 1.5147207975387573, "reward_std": 0.0527060441672802, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.514720618724823, "rewards/correct_reward_func/std": 0.19597549736499786, "step": 982 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1944.0, "completions/max_terminated_length": 1944.0, "completions/mean_length": 1432.09521484375, "completions/mean_terminated_length": 1432.09521484375, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 1.5311526479750779, "grad_norm": 2.669544219970703, "kl": 0.10140970535576344, "learning_rate": 1.3925e-06, "loss": -0.0186, "num_tokens": 128004064.0, "reward": 1.4886975288391113, "reward_std": 0.09048821777105331, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5006023645401001, "rewards/correct_reward_func/std": 0.16248732805252075, "step": 983 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2243.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 1483.3690185546875, "completions/mean_terminated_length": 1483.3690185546875, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 1.5327102803738317, "grad_norm": 0.5933528542518616, "kl": 0.055261287838220596, "learning_rate": 1.391875e-06, "loss": 0.0152, "num_tokens": 128134595.0, "reward": 1.4884581565856934, "reward_std": 0.04756522923707962, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4884580373764038, "rewards/correct_reward_func/std": 0.1384175568819046, "step": 984 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 1441.3929443359375, "completions/mean_terminated_length": 1441.3929443359375, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 1.5342679127725858, "grad_norm": 0.5908680558204651, "kl": 0.05583428591489792, "learning_rate": 1.39125e-06, "loss": -0.025, "num_tokens": 128261546.0, "reward": 1.5170215368270874, "reward_std": 0.05428963899612427, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5170214176177979, "rewards/correct_reward_func/std": 0.1315389722585678, "step": 985 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1482.5714111328125, "completions/mean_terminated_length": 1482.5714111328125, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 1.5358255451713396, "grad_norm": 0.6321178078651428, "kl": 0.05928328447043896, "learning_rate": 1.390625e-06, "loss": 0.0214, "num_tokens": 128392154.0, "reward": 1.5175918340682983, "reward_std": 0.05746595188975334, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5175917744636536, "rewards/correct_reward_func/std": 0.15857887268066406, "step": 986 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 1456.4285888671875, "completions/mean_terminated_length": 1456.4285888671875, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 1.5373831775700935, "grad_norm": 0.5870681405067444, "kl": 0.05598471499979496, "learning_rate": 1.3899999999999998e-06, "loss": 0.0314, "num_tokens": 128520566.0, "reward": 1.5149093866348267, "reward_std": 0.04380248114466667, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5149092674255371, "rewards/correct_reward_func/std": 0.14865918457508087, "step": 987 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 1492.047607421875, "completions/mean_terminated_length": 1492.047607421875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 1.5389408099688473, "grad_norm": 0.6118965148925781, "kl": 0.056141821667551994, "learning_rate": 1.389375e-06, "loss": 0.0201, "num_tokens": 128651760.0, "reward": 1.5297244787216187, "reward_std": 0.04985615238547325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5297244787216187, "rewards/correct_reward_func/std": 0.12817271053791046, "step": 988 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 1529.09521484375, "completions/mean_terminated_length": 1529.09521484375, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 1.5404984423676011, "grad_norm": 0.5834553241729736, "kl": 0.05576501786708832, "learning_rate": 1.3887499999999998e-06, "loss": -0.0064, "num_tokens": 128786180.0, "reward": 1.4784826040267944, "reward_std": 0.056521281599998474, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47848257422447205, "rewards/correct_reward_func/std": 0.17981821298599243, "step": 989 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2528.0, "completions/mean_length": 1558.1905517578125, "completions/mean_terminated_length": 1478.2650146484375, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 1.542056074766355, "grad_norm": 0.5497155785560608, "kl": 0.05536571890115738, "learning_rate": 1.388125e-06, "loss": 0.0521, "num_tokens": 128922936.0, "reward": 1.5244368314743042, "reward_std": 0.036744069308042526, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5244366526603699, "rewards/correct_reward_func/std": 0.1845894306898117, "step": 990 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 1546.202392578125, "completions/mean_terminated_length": 1546.202392578125, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 1.543613707165109, "grad_norm": 0.550896167755127, "kl": 0.058403681963682175, "learning_rate": 1.3874999999999998e-06, "loss": 0.0058, "num_tokens": 129058733.0, "reward": 1.5245283842086792, "reward_std": 0.09339477121829987, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5364329814910889, "rewards/correct_reward_func/std": 0.1055811196565628, "step": 991 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 1498.2857666015625, "completions/mean_terminated_length": 1498.2857666015625, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 1.5451713395638629, "grad_norm": 0.6179094314575195, "kl": 0.05653979256749153, "learning_rate": 1.386875e-06, "loss": 0.0177, "num_tokens": 129190469.0, "reward": 1.5216782093048096, "reward_std": 0.05781848728656769, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5216782093048096, "rewards/correct_reward_func/std": 0.14361537992954254, "step": 992 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 1471.761962890625, "completions/mean_terminated_length": 1471.761962890625, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 1.546728971962617, "grad_norm": 0.6041738986968994, "kl": 0.055141156539320946, "learning_rate": 1.3862499999999999e-06, "loss": -0.014, "num_tokens": 129320079.0, "reward": 1.5567200183868408, "reward_std": 0.053502753376960754, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.556719958782196, "rewards/correct_reward_func/std": 0.18537770211696625, "step": 993 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1541.607177734375, "completions/mean_terminated_length": 1541.607177734375, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 1.5482866043613708, "grad_norm": 0.5438805222511292, "kl": 0.05603756755590439, "learning_rate": 1.385625e-06, "loss": 0.0134, "num_tokens": 129455634.0, "reward": 1.5354129076004028, "reward_std": 0.0948275551199913, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5473176836967468, "rewards/correct_reward_func/std": 0.15340009331703186, "step": 994 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1527.7261962890625, "completions/mean_terminated_length": 1527.7261962890625, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "epoch": 1.5498442367601246, "grad_norm": 0.5533144474029541, "kl": 0.059015773236751556, "learning_rate": 1.3849999999999999e-06, "loss": 0.0131, "num_tokens": 129589861.0, "reward": 1.4884942770004272, "reward_std": 0.09217140078544617, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5003989338874817, "rewards/correct_reward_func/std": 0.15462777018547058, "step": 995 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 1493.6785888671875, "completions/mean_terminated_length": 1493.6785888671875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 1.5514018691588785, "grad_norm": 0.5743140578269958, "kl": 0.05605132691562176, "learning_rate": 1.3843749999999998e-06, "loss": -0.0153, "num_tokens": 129721270.0, "reward": 1.5518122911453247, "reward_std": 0.05846726894378662, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5518122911453247, "rewards/correct_reward_func/std": 0.1423969566822052, "step": 996 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1481.202392578125, "completions/mean_terminated_length": 1481.202392578125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 1.5529595015576323, "grad_norm": 0.610561728477478, "kl": 0.058797696605324745, "learning_rate": 1.3837499999999999e-06, "loss": -0.0169, "num_tokens": 129851649.0, "reward": 1.5218530893325806, "reward_std": 0.06818825751543045, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5337576866149902, "rewards/correct_reward_func/std": 0.12073443830013275, "step": 997 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 1468.261962890625, "completions/mean_terminated_length": 1468.261962890625, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 1.5545171339563861, "grad_norm": 0.6080528497695923, "kl": 0.059630218893289566, "learning_rate": 1.3831249999999998e-06, "loss": 0.0039, "num_tokens": 129981037.0, "reward": 1.4734840393066406, "reward_std": 0.07547850161790848, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4853888154029846, "rewards/correct_reward_func/std": 0.13890855014324188, "step": 998 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1448.4405517578125, "completions/mean_terminated_length": 1448.4405517578125, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "epoch": 1.5560747663551402, "grad_norm": 0.557486891746521, "kl": 0.058954037725925446, "learning_rate": 1.3825e-06, "loss": -0.0353, "num_tokens": 130108586.0, "reward": 1.6090483665466309, "reward_std": 0.07873685657978058, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.6090484261512756, "rewards/correct_reward_func/std": 0.14213018119335175, "step": 999 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 1544.6785888671875, "completions/mean_terminated_length": 1544.6785888671875, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 1.557632398753894, "grad_norm": 0.594116747379303, "kl": 0.0562005378305912, "learning_rate": 1.381875e-06, "loss": 0.0047, "num_tokens": 130244213.0, "reward": 1.5270923376083374, "reward_std": 0.07249116897583008, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5389969944953918, "rewards/correct_reward_func/std": 0.18675528466701508, "step": 1000 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2879.0, "completions/max_terminated_length": 2879.0, "completions/mean_length": 1532.6905517578125, "completions/mean_terminated_length": 1532.6905517578125, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 1.559190031152648, "grad_norm": 0.6067295670509338, "kl": 0.05792676471173763, "learning_rate": 1.3812500000000001e-06, "loss": -0.0092, "num_tokens": 130379007.0, "reward": 1.4932868480682373, "reward_std": 0.050280094146728516, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49328672885894775, "rewards/correct_reward_func/std": 0.12472639977931976, "step": 1001 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1516.3809814453125, "completions/mean_terminated_length": 1516.3809814453125, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 1.560747663551402, "grad_norm": 0.6010133624076843, "kl": 0.05928630381822586, "learning_rate": 1.380625e-06, "loss": 0.0242, "num_tokens": 130512293.0, "reward": 1.516017198562622, "reward_std": 0.10517614334821701, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5279219746589661, "rewards/correct_reward_func/std": 0.15172363817691803, "step": 1002 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1554.761962890625, "completions/mean_terminated_length": 1554.761962890625, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "epoch": 1.5623052959501558, "grad_norm": 0.5720633864402771, "kl": 0.060122422873973846, "learning_rate": 1.38e-06, "loss": 0.0131, "num_tokens": 130648959.0, "reward": 1.4512741565704346, "reward_std": 0.045847050845623016, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45127415657043457, "rewards/correct_reward_func/std": 0.14463678002357483, "step": 1003 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 1560.0833740234375, "completions/mean_terminated_length": 1560.0833740234375, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 1.5638629283489096, "grad_norm": 0.5987496376037598, "kl": 0.055827124044299126, "learning_rate": 1.379375e-06, "loss": 0.005, "num_tokens": 130786078.0, "reward": 1.4668896198272705, "reward_std": 0.041976310312747955, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4668896496295929, "rewards/correct_reward_func/std": 0.09860485792160034, "step": 1004 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2589.0, "completions/max_terminated_length": 2589.0, "completions/mean_length": 1553.5833740234375, "completions/mean_terminated_length": 1553.5833740234375, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 1.5654205607476634, "grad_norm": 0.5509441494941711, "kl": 0.05763794295489788, "learning_rate": 1.37875e-06, "loss": -0.0053, "num_tokens": 130922519.0, "reward": 1.4821547269821167, "reward_std": 0.04169899597764015, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48215457797050476, "rewards/correct_reward_func/std": 0.17924876511096954, "step": 1005 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 1510.857177734375, "completions/mean_terminated_length": 1510.857177734375, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 1.5669781931464173, "grad_norm": 0.5987504720687866, "kl": 0.05725214071571827, "learning_rate": 1.378125e-06, "loss": 0.0309, "num_tokens": 131055323.0, "reward": 1.4259028434753418, "reward_std": 0.13602343201637268, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.46161696314811707, "rewards/correct_reward_func/std": 0.1658160239458084, "step": 1006 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 1545.357177734375, "completions/mean_terminated_length": 1545.357177734375, "completions/min_length": 983.0, "completions/min_terminated_length": 983.0, "epoch": 1.5685358255451713, "grad_norm": 0.5653539299964905, "kl": 0.056894658133387566, "learning_rate": 1.3775e-06, "loss": -0.021, "num_tokens": 131191073.0, "reward": 1.4935802221298218, "reward_std": 0.07257944345474243, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5054848194122314, "rewards/correct_reward_func/std": 0.15568366646766663, "step": 1007 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 1608.702392578125, "completions/mean_terminated_length": 1529.385498046875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 1.5700934579439252, "grad_norm": 0.5624326467514038, "kl": 0.055087700486183167, "learning_rate": 1.376875e-06, "loss": 0.0575, "num_tokens": 131332282.0, "reward": 1.5021895170211792, "reward_std": 0.07486995309591293, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5021894574165344, "rewards/correct_reward_func/std": 0.1269064098596573, "step": 1008 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 1530.0, "completions/mean_terminated_length": 1530.0, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "epoch": 1.5716510903426792, "grad_norm": 0.6044917106628418, "kl": 0.056555962190032005, "learning_rate": 1.37625e-06, "loss": 0.014, "num_tokens": 131466610.0, "reward": 1.5315054655075073, "reward_std": 0.05434272065758705, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5315054655075073, "rewards/correct_reward_func/std": 0.14260385930538177, "step": 1009 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1527.3809814453125, "completions/mean_terminated_length": 1527.3809814453125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 1.573208722741433, "grad_norm": 0.5926375389099121, "kl": 0.05986838415265083, "learning_rate": 1.375625e-06, "loss": 0.0065, "num_tokens": 131600940.0, "reward": 1.4688937664031982, "reward_std": 0.09081895649433136, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4807983636856079, "rewards/correct_reward_func/std": 0.15290535986423492, "step": 1010 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2257.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 1533.0714111328125, "completions/mean_terminated_length": 1533.0714111328125, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 1.574766355140187, "grad_norm": 0.5784596800804138, "kl": 0.05727861449122429, "learning_rate": 1.375e-06, "loss": -0.0163, "num_tokens": 131735778.0, "reward": 1.4873205423355103, "reward_std": 0.05376929044723511, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4873204827308655, "rewards/correct_reward_func/std": 0.1634834259748459, "step": 1011 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 1540.047607421875, "completions/mean_terminated_length": 1540.047607421875, "completions/min_length": 1064.0, "completions/min_terminated_length": 1064.0, "epoch": 1.5763239875389408, "grad_norm": 0.652586817741394, "kl": 0.05992012470960617, "learning_rate": 1.3743749999999999e-06, "loss": 0.0102, "num_tokens": 131871196.0, "reward": 1.4239557981491089, "reward_std": 0.08067857474088669, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43586039543151855, "rewards/correct_reward_func/std": 0.14762204885482788, "step": 1012 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1488.4761962890625, "completions/mean_terminated_length": 1488.4761962890625, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 1.5778816199376946, "grad_norm": 0.5661947727203369, "kl": 0.05911418795585632, "learning_rate": 1.37375e-06, "loss": 0.0158, "num_tokens": 132002162.0, "reward": 1.525542140007019, "reward_std": 0.06704176217317581, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5255421996116638, "rewards/correct_reward_func/std": 0.13780654966831207, "step": 1013 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1525.416748046875, "completions/mean_terminated_length": 1525.416748046875, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "epoch": 1.5794392523364484, "grad_norm": 0.6187487840652466, "kl": 0.05780523456633091, "learning_rate": 1.3731249999999999e-06, "loss": 0.0074, "num_tokens": 132136339.0, "reward": 1.526188611984253, "reward_std": 0.05230359733104706, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5261886119842529, "rewards/correct_reward_func/std": 0.18830551207065582, "step": 1014 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 1456.6905517578125, "completions/mean_terminated_length": 1456.6905517578125, "completions/min_length": 1014.0, "completions/min_terminated_length": 1014.0, "epoch": 1.5809968847352025, "grad_norm": 0.6542757153511047, "kl": 0.0579967275261879, "learning_rate": 1.3725e-06, "loss": 0.0149, "num_tokens": 132264659.0, "reward": 1.5420114994049072, "reward_std": 0.044709715992212296, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5420114994049072, "rewards/correct_reward_func/std": 0.190092533826828, "step": 1015 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1959.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1480.15478515625, "completions/mean_terminated_length": 1480.15478515625, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 1.5825545171339563, "grad_norm": 0.6099100112915039, "kl": 0.054762642830610275, "learning_rate": 1.3718749999999999e-06, "loss": -0.0104, "num_tokens": 132395142.0, "reward": 1.4359278678894043, "reward_std": 0.08171571046113968, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4478323757648468, "rewards/correct_reward_func/std": 0.14784473180770874, "step": 1016 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1411.0238037109375, "completions/mean_terminated_length": 1411.0238037109375, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 1.5841121495327104, "grad_norm": 0.6238840818405151, "kl": 0.058730076998472214, "learning_rate": 1.37125e-06, "loss": -0.0187, "num_tokens": 132519596.0, "reward": 1.462868094444275, "reward_std": 0.042033370584249496, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46286800503730774, "rewards/correct_reward_func/std": 0.1607520580291748, "step": 1017 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1519.59521484375, "completions/mean_terminated_length": 1439.2047119140625, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 1.5856697819314642, "grad_norm": 0.5553354024887085, "kl": 0.05277878977358341, "learning_rate": 1.370625e-06, "loss": 0.047, "num_tokens": 132653176.0, "reward": 1.5335984230041504, "reward_std": 0.07575073838233948, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5335984230041504, "rewards/correct_reward_func/std": 0.13631217181682587, "step": 1018 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2355.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 1459.96435546875, "completions/mean_terminated_length": 1459.96435546875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 1.587227414330218, "grad_norm": 0.6125195622444153, "kl": 0.055847397074103355, "learning_rate": 1.37e-06, "loss": -0.0113, "num_tokens": 132781771.0, "reward": 1.4723474979400635, "reward_std": 0.05186872184276581, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4723474085330963, "rewards/correct_reward_func/std": 0.13183002173900604, "step": 1019 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1417.0595703125, "completions/mean_terminated_length": 1417.0595703125, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 1.588785046728972, "grad_norm": 0.6113051176071167, "kl": 0.0562934186309576, "learning_rate": 1.369375e-06, "loss": 0.002, "num_tokens": 132906858.0, "reward": 1.512320876121521, "reward_std": 0.049453962594270706, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5123208165168762, "rewards/correct_reward_func/std": 0.1713578850030899, "step": 1020 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7480.0, "completions/max_terminated_length": 7480.0, "completions/mean_length": 1510.8690185546875, "completions/mean_terminated_length": 1510.8690185546875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 1.5903426791277258, "grad_norm": 0.5587428212165833, "kl": 0.05253695324063301, "learning_rate": 1.3687499999999998e-06, "loss": -0.0235, "num_tokens": 133039669.0, "reward": 1.599755048751831, "reward_std": 0.05422939732670784, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5997551083564758, "rewards/correct_reward_func/std": 0.19660888612270355, "step": 1021 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3498.0, "completions/max_terminated_length": 3498.0, "completions/mean_length": 1420.2261962890625, "completions/mean_terminated_length": 1420.2261962890625, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 1.5919003115264796, "grad_norm": 0.6202037334442139, "kl": 0.05383214168250561, "learning_rate": 1.368125e-06, "loss": -0.0053, "num_tokens": 133164914.0, "reward": 1.5503116846084595, "reward_std": 0.0778794214129448, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5622163414955139, "rewards/correct_reward_func/std": 0.14359301328659058, "step": 1022 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 1443.261962890625, "completions/mean_terminated_length": 1443.261962890625, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 1.5934579439252337, "grad_norm": 0.5619306564331055, "kl": 0.05369856022298336, "learning_rate": 1.3674999999999998e-06, "loss": 0.016, "num_tokens": 133292262.0, "reward": 1.529478907585144, "reward_std": 0.09542658179998398, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5413835644721985, "rewards/correct_reward_func/std": 0.17236556112766266, "step": 1023 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1391.15478515625, "completions/mean_terminated_length": 1391.15478515625, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 1.5950155763239875, "grad_norm": 0.624824583530426, "kl": 0.05782540142536163, "learning_rate": 1.366875e-06, "loss": -0.0107, "num_tokens": 133414915.0, "reward": 1.4799925088882446, "reward_std": 0.0581979975104332, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47999244928359985, "rewards/correct_reward_func/std": 0.13127374649047852, "step": 1024 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2126.0, "completions/max_terminated_length": 2126.0, "completions/mean_length": 1430.916748046875, "completions/mean_terminated_length": 1430.916748046875, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 1.5965732087227416, "grad_norm": 0.6070372462272644, "kl": 0.05460609495639801, "learning_rate": 1.3662499999999998e-06, "loss": -0.002, "num_tokens": 133541334.0, "reward": 1.456343412399292, "reward_std": 0.061086416244506836, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45634332299232483, "rewards/correct_reward_func/std": 0.11224711686372757, "step": 1025 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2177.0, "completions/max_terminated_length": 2177.0, "completions/mean_length": 1420.047607421875, "completions/mean_terminated_length": 1420.047607421875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 1.5981308411214954, "grad_norm": 0.5938939452171326, "kl": 0.05456771142780781, "learning_rate": 1.365625e-06, "loss": -0.0131, "num_tokens": 133666672.0, "reward": 1.5658003091812134, "reward_std": 0.07403706759214401, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5658001899719238, "rewards/correct_reward_func/std": 0.11373692005872726, "step": 1026 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1408.702392578125, "completions/mean_terminated_length": 1408.702392578125, "completions/min_length": 1006.0, "completions/min_terminated_length": 1006.0, "epoch": 1.5996884735202492, "grad_norm": 0.5788083076477051, "kl": 0.05543653294444084, "learning_rate": 1.3649999999999998e-06, "loss": 0.0077, "num_tokens": 133790973.0, "reward": 1.4734599590301514, "reward_std": 0.0897451639175415, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4853646457195282, "rewards/correct_reward_func/std": 0.1481408178806305, "step": 1027 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 1497.09521484375, "completions/mean_terminated_length": 1497.09521484375, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "epoch": 1.601246105919003, "grad_norm": 0.5803978443145752, "kl": 0.05440870113670826, "learning_rate": 1.3643749999999997e-06, "loss": 0.0146, "num_tokens": 133922681.0, "reward": 1.5256907939910889, "reward_std": 0.0636632889509201, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5256906151771545, "rewards/correct_reward_func/std": 0.16266074776649475, "step": 1028 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 1530.40478515625, "completions/mean_terminated_length": 1450.14453125, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 1.602803738317757, "grad_norm": 0.5963298082351685, "kl": 0.05118001997470856, "learning_rate": 1.3637499999999999e-06, "loss": 0.0616, "num_tokens": 134057181.0, "reward": 1.521901249885559, "reward_std": 0.06911131739616394, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5219012498855591, "rewards/correct_reward_func/std": 0.14165343344211578, "step": 1029 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 1505.8333740234375, "completions/mean_terminated_length": 1505.8333740234375, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 1.6043613707165107, "grad_norm": 0.5990629196166992, "kl": 0.054124271497130394, "learning_rate": 1.3631249999999998e-06, "loss": -0.0114, "num_tokens": 134189629.0, "reward": 1.4873378276824951, "reward_std": 0.06727422028779984, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48733773827552795, "rewards/correct_reward_func/std": 0.13689805567264557, "step": 1030 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1450.3690185546875, "completions/mean_terminated_length": 1450.3690185546875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 1.6059190031152648, "grad_norm": 0.5990805625915527, "kl": 0.05593542754650116, "learning_rate": 1.3625e-06, "loss": -0.0076, "num_tokens": 134317556.0, "reward": 1.4957433938980103, "reward_std": 0.0669202134013176, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4957433044910431, "rewards/correct_reward_func/std": 0.17975614964962006, "step": 1031 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2742.0, "completions/max_terminated_length": 2742.0, "completions/mean_length": 1469.7261962890625, "completions/mean_terminated_length": 1469.7261962890625, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 1.6074766355140186, "grad_norm": 0.5659118890762329, "kl": 0.055733053013682365, "learning_rate": 1.361875e-06, "loss": 0.0027, "num_tokens": 134447007.0, "reward": 1.5046303272247314, "reward_std": 0.06078405678272247, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5046302676200867, "rewards/correct_reward_func/std": 0.15718205273151398, "step": 1032 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1441.3214111328125, "completions/mean_terminated_length": 1441.3214111328125, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 1.6090342679127727, "grad_norm": 0.6231014728546143, "kl": 0.05473615229129791, "learning_rate": 1.36125e-06, "loss": 0.0129, "num_tokens": 134573994.0, "reward": 1.5341296195983887, "reward_std": 0.06255558878183365, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5341295599937439, "rewards/correct_reward_func/std": 0.21032603085041046, "step": 1033 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3073.0, "completions/max_terminated_length": 3073.0, "completions/mean_length": 1508.5357666015625, "completions/mean_terminated_length": 1508.5357666015625, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "epoch": 1.6105919003115265, "grad_norm": 0.5707244873046875, "kl": 0.05204140394926071, "learning_rate": 1.360625e-06, "loss": -0.0173, "num_tokens": 134706765.0, "reward": 1.4597147703170776, "reward_std": 0.09199413657188416, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4716194272041321, "rewards/correct_reward_func/std": 0.13465441763401031, "step": 1034 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 1537.0238037109375, "completions/mean_terminated_length": 1456.84326171875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 1.6121495327102804, "grad_norm": 0.554625391960144, "kl": 0.05230444855988026, "learning_rate": 1.3600000000000001e-06, "loss": 0.0366, "num_tokens": 134841917.0, "reward": 1.4520354270935059, "reward_std": 0.06311675906181335, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4520353078842163, "rewards/correct_reward_func/std": 0.1268446296453476, "step": 1035 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1457.7857666015625, "completions/mean_terminated_length": 1457.7857666015625, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "epoch": 1.6137071651090342, "grad_norm": 0.5534580945968628, "kl": 0.050926633179187775, "learning_rate": 1.359375e-06, "loss": -0.0421, "num_tokens": 134970371.0, "reward": 1.509501576423645, "reward_std": 0.06214066967368126, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5095014572143555, "rewards/correct_reward_func/std": 0.16853904724121094, "step": 1036 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 1481.9761962890625, "completions/mean_terminated_length": 1481.9761962890625, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 1.615264797507788, "grad_norm": 0.5990552306175232, "kl": 0.05413435027003288, "learning_rate": 1.35875e-06, "loss": 0.0117, "num_tokens": 135100695.0, "reward": 1.4653639793395996, "reward_std": 0.05351106822490692, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46536386013031006, "rewards/correct_reward_func/std": 0.13719916343688965, "step": 1037 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 1503.3214111328125, "completions/mean_terminated_length": 1503.3214111328125, "completions/min_length": 952.0, "completions/min_terminated_length": 952.0, "epoch": 1.616822429906542, "grad_norm": 0.6220361590385437, "kl": 0.052610600367188454, "learning_rate": 1.358125e-06, "loss": -0.0101, "num_tokens": 135232962.0, "reward": 1.5128328800201416, "reward_std": 0.055453550070524216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5128328204154968, "rewards/correct_reward_func/std": 0.1644771248102188, "step": 1038 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3425.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 1552.8929443359375, "completions/mean_terminated_length": 1552.8929443359375, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "epoch": 1.618380062305296, "grad_norm": 0.5130976438522339, "kl": 0.053666651248931885, "learning_rate": 1.3575e-06, "loss": -0.0076, "num_tokens": 135369519.0, "reward": 1.526445746421814, "reward_std": 0.05217195674777031, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5264455676078796, "rewards/correct_reward_func/std": 0.16648080945014954, "step": 1039 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3257.0, "completions/max_terminated_length": 3257.0, "completions/mean_length": 1544.0595703125, "completions/mean_terminated_length": 1544.0595703125, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 1.6199376947040498, "grad_norm": 0.5726479887962341, "kl": 0.05157448537647724, "learning_rate": 1.356875e-06, "loss": -0.0154, "num_tokens": 135505424.0, "reward": 1.521503210067749, "reward_std": 0.05375625565648079, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5215030908584595, "rewards/correct_reward_func/std": 0.14469487965106964, "step": 1040 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 1633.1905517578125, "completions/mean_terminated_length": 1633.1905517578125, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 1.6214953271028039, "grad_norm": 0.5411516427993774, "kl": 0.051717083901166916, "learning_rate": 1.35625e-06, "loss": 0.0155, "num_tokens": 135648744.0, "reward": 1.5200222730636597, "reward_std": 0.07136886566877365, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5200221538543701, "rewards/correct_reward_func/std": 0.16770030558109283, "step": 1041 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1519.5357666015625, "completions/mean_terminated_length": 1519.5357666015625, "completions/min_length": 972.0, "completions/min_terminated_length": 972.0, "epoch": 1.6230529595015577, "grad_norm": 0.5928486585617065, "kl": 0.053753653541207314, "learning_rate": 1.355625e-06, "loss": -0.0071, "num_tokens": 135782319.0, "reward": 1.4707903861999512, "reward_std": 0.06930838525295258, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47079023718833923, "rewards/correct_reward_func/std": 0.16793005168437958, "step": 1042 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 1541.047607421875, "completions/mean_terminated_length": 1541.047607421875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 1.6246105919003115, "grad_norm": 0.5260264873504639, "kl": 0.054449863731861115, "learning_rate": 1.355e-06, "loss": 0.0169, "num_tokens": 135917485.0, "reward": 1.464097261428833, "reward_std": 0.050796028226614, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46409717202186584, "rewards/correct_reward_func/std": 0.1619662493467331, "step": 1043 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1546.1905517578125, "completions/mean_terminated_length": 1546.1905517578125, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 1.6261682242990654, "grad_norm": 0.5629999041557312, "kl": 0.05364096164703369, "learning_rate": 1.354375e-06, "loss": 0.0071, "num_tokens": 136053221.0, "reward": 1.4347970485687256, "reward_std": 0.07657790184020996, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44670167565345764, "rewards/correct_reward_func/std": 0.1376093178987503, "step": 1044 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3403.0, "completions/max_terminated_length": 3403.0, "completions/mean_length": 1530.40478515625, "completions/mean_terminated_length": 1530.40478515625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 1.6277258566978192, "grad_norm": 0.5891228914260864, "kl": 0.05658549815416336, "learning_rate": 1.35375e-06, "loss": 0.0363, "num_tokens": 136187685.0, "reward": 1.4625400304794312, "reward_std": 0.05683088302612305, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.462539941072464, "rewards/correct_reward_func/std": 0.12948718667030334, "step": 1045 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 1615.2381591796875, "completions/mean_terminated_length": 1615.2381591796875, "completions/min_length": 1101.0, "completions/min_terminated_length": 1101.0, "epoch": 1.629283489096573, "grad_norm": 0.5390412211418152, "kl": 0.05128757283091545, "learning_rate": 1.3531249999999999e-06, "loss": 0.0082, "num_tokens": 136329263.0, "reward": 1.4278221130371094, "reward_std": 0.07720060646533966, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4397267699241638, "rewards/correct_reward_func/std": 0.10165135562419891, "step": 1046 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 1576.4405517578125, "completions/mean_terminated_length": 1576.4405517578125, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 1.6308411214953271, "grad_norm": 0.5377297401428223, "kl": 0.052540283650159836, "learning_rate": 1.3525e-06, "loss": 0.0175, "num_tokens": 136467696.0, "reward": 1.5124995708465576, "reward_std": 0.04360097274184227, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5124995708465576, "rewards/correct_reward_func/std": 0.15979255735874176, "step": 1047 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3179.0, "completions/max_terminated_length": 3179.0, "completions/mean_length": 1554.011962890625, "completions/mean_terminated_length": 1554.011962890625, "completions/min_length": 1072.0, "completions/min_terminated_length": 1072.0, "epoch": 1.632398753894081, "grad_norm": 0.5838060975074768, "kl": 0.05435189604759216, "learning_rate": 1.3518749999999999e-06, "loss": 0.0334, "num_tokens": 136604221.0, "reward": 1.5408968925476074, "reward_std": 0.05577825382351875, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5408968925476074, "rewards/correct_reward_func/std": 0.18390248715877533, "step": 1048 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2710.0, "completions/max_terminated_length": 2710.0, "completions/mean_length": 1569.96435546875, "completions/mean_terminated_length": 1569.96435546875, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 1.633956386292835, "grad_norm": 0.5855342745780945, "kl": 0.05437758192420006, "learning_rate": 1.35125e-06, "loss": 0.0285, "num_tokens": 136741918.0, "reward": 1.4691810607910156, "reward_std": 0.051512788981199265, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4691811203956604, "rewards/correct_reward_func/std": 0.1329197734594345, "step": 1049 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1510.9285888671875, "completions/mean_terminated_length": 1510.9285888671875, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 1.6355140186915889, "grad_norm": 0.5731550455093384, "kl": 0.054405104368925095, "learning_rate": 1.3506249999999999e-06, "loss": -0.0065, "num_tokens": 136874722.0, "reward": 1.508426547050476, "reward_std": 0.039651330560445786, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5084263682365417, "rewards/correct_reward_func/std": 0.16465696692466736, "step": 1050 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2168.0, "completions/max_terminated_length": 2168.0, "completions/mean_length": 1565.8929443359375, "completions/mean_terminated_length": 1565.8929443359375, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 1.6370716510903427, "grad_norm": 0.5914250612258911, "kl": 0.05762408673763275, "learning_rate": 1.35e-06, "loss": -0.0167, "num_tokens": 137012131.0, "reward": 1.3974400758743286, "reward_std": 0.08153515309095383, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.40934473276138306, "rewards/correct_reward_func/std": 0.15768387913703918, "step": 1051 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 1622.9405517578125, "completions/mean_terminated_length": 1622.9405517578125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 1.6386292834890965, "grad_norm": 0.5383678674697876, "kl": 0.05545662343502045, "learning_rate": 1.3493749999999999e-06, "loss": 0.0008, "num_tokens": 137154440.0, "reward": 1.5292316675186157, "reward_std": 0.04267982766032219, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5292316675186157, "rewards/correct_reward_func/std": 0.16781207919120789, "step": 1052 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 1512.3690185546875, "completions/mean_terminated_length": 1512.3690185546875, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 1.6401869158878504, "grad_norm": 0.5991505980491638, "kl": 0.05531277321279049, "learning_rate": 1.3487499999999998e-06, "loss": 0.0039, "num_tokens": 137287293.0, "reward": 1.4677830934524536, "reward_std": 0.0783349946141243, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4796878397464752, "rewards/correct_reward_func/std": 0.16522300243377686, "step": 1053 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1547.0238037109375, "completions/mean_terminated_length": 1547.0238037109375, "completions/min_length": 1088.0, "completions/min_terminated_length": 1088.0, "epoch": 1.6417445482866042, "grad_norm": 0.5571765303611755, "kl": 0.054872432723641396, "learning_rate": 1.348125e-06, "loss": -0.006, "num_tokens": 137423153.0, "reward": 1.4879200458526611, "reward_std": 0.061748720705509186, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48791995644569397, "rewards/correct_reward_func/std": 0.17064164578914642, "step": 1054 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 1538.5238037109375, "completions/mean_terminated_length": 1538.5238037109375, "completions/min_length": 336.0, "completions/min_terminated_length": 336.0, "epoch": 1.6433021806853583, "grad_norm": 0.5635541081428528, "kl": 0.05561501905322075, "learning_rate": 1.3474999999999998e-06, "loss": -0.0271, "num_tokens": 137558395.0, "reward": 1.4898490905761719, "reward_std": 0.17587438225746155, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.5374680757522583, "rewards/correct_reward_func/std": 0.1749303787946701, "step": 1055 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 1577.7857666015625, "completions/mean_terminated_length": 1577.7857666015625, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "epoch": 1.644859813084112, "grad_norm": 0.5854898691177368, "kl": 0.05366738140583038, "learning_rate": 1.346875e-06, "loss": 0.015, "num_tokens": 137697247.0, "reward": 1.512568712234497, "reward_std": 0.07942016422748566, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5244733691215515, "rewards/correct_reward_func/std": 0.14364567399024963, "step": 1056 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 1541.0238037109375, "completions/mean_terminated_length": 1541.0238037109375, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 1.6464174454828662, "grad_norm": 0.5717303156852722, "kl": 0.05528515763580799, "learning_rate": 1.3462499999999998e-06, "loss": -0.0003, "num_tokens": 137832453.0, "reward": 1.464308261871338, "reward_std": 0.050033025443553925, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4643082022666931, "rewards/correct_reward_func/std": 0.16159728169441223, "step": 1057 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2872.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 1476.916748046875, "completions/mean_terminated_length": 1476.916748046875, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 1.64797507788162, "grad_norm": 0.6368807554244995, "kl": 0.06014778092503548, "learning_rate": 1.345625e-06, "loss": -0.0226, "num_tokens": 137962472.0, "reward": 1.5432347059249878, "reward_std": 0.06146860867738724, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5432345271110535, "rewards/correct_reward_func/std": 0.17585954070091248, "step": 1058 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1579.107177734375, "completions/mean_terminated_length": 1579.107177734375, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 1.6495327102803738, "grad_norm": 0.6340283155441284, "kl": 0.05687938071787357, "learning_rate": 1.3449999999999998e-06, "loss": -0.0136, "num_tokens": 138101327.0, "reward": 1.482109785079956, "reward_std": 0.05752336606383324, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48210975527763367, "rewards/correct_reward_func/std": 0.14570102095603943, "step": 1059 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2802.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 1518.5357666015625, "completions/mean_terminated_length": 1518.5357666015625, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 1.6510903426791277, "grad_norm": 0.6107741594314575, "kl": 0.055661765858531, "learning_rate": 1.344375e-06, "loss": 0.0224, "num_tokens": 138234698.0, "reward": 1.4427367448806763, "reward_std": 0.109878771007061, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4665461480617523, "rewards/correct_reward_func/std": 0.14121823012828827, "step": 1060 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2351.0, "completions/max_terminated_length": 2351.0, "completions/mean_length": 1640.107177734375, "completions/mean_terminated_length": 1640.107177734375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.6526479750778815, "grad_norm": 0.5215502977371216, "kl": 0.054962895810604095, "learning_rate": 1.3437499999999998e-06, "loss": -0.0108, "num_tokens": 138378545.0, "reward": 1.4496138095855713, "reward_std": 0.0706188976764679, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4496137201786041, "rewards/correct_reward_func/std": 0.14065679907798767, "step": 1061 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 1583.666748046875, "completions/mean_terminated_length": 1583.666748046875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 1.6542056074766354, "grad_norm": 0.5921149849891663, "kl": 0.0587878804653883, "learning_rate": 1.3431249999999997e-06, "loss": 0.0053, "num_tokens": 138517525.0, "reward": 1.4806667566299438, "reward_std": 0.0666496753692627, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48066675662994385, "rewards/correct_reward_func/std": 0.14950628578662872, "step": 1062 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2588.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 1640.2381591796875, "completions/mean_terminated_length": 1640.2381591796875, "completions/min_length": 1109.0, "completions/min_terminated_length": 1109.0, "epoch": 1.6557632398753894, "grad_norm": 0.5615301132202148, "kl": 0.05401209369301796, "learning_rate": 1.3425e-06, "loss": -0.0211, "num_tokens": 138661371.0, "reward": 1.5019553899765015, "reward_std": 0.07899212092161179, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5138601064682007, "rewards/correct_reward_func/std": 0.14160452783107758, "step": 1063 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 1633.8809814453125, "completions/mean_terminated_length": 1633.8809814453125, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 1.6573208722741433, "grad_norm": 0.5847644209861755, "kl": 0.054801538586616516, "learning_rate": 1.341875e-06, "loss": -0.0245, "num_tokens": 138804701.0, "reward": 1.4788357019424438, "reward_std": 0.09718337655067444, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5026451945304871, "rewards/correct_reward_func/std": 0.13821066915988922, "step": 1064 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 1599.9761962890625, "completions/mean_terminated_length": 1599.9761962890625, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 1.6588785046728973, "grad_norm": 0.5800467729568481, "kl": 0.052408963441848755, "learning_rate": 1.34125e-06, "loss": 0.0003, "num_tokens": 138945129.0, "reward": 1.533974051475525, "reward_std": 0.06312627345323563, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5339741706848145, "rewards/correct_reward_func/std": 0.17329341173171997, "step": 1065 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 1632.75, "completions/mean_terminated_length": 1632.75, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 1.6604361370716512, "grad_norm": 0.582881510257721, "kl": 0.05381094291806221, "learning_rate": 1.340625e-06, "loss": -0.0087, "num_tokens": 139088250.0, "reward": 1.4607641696929932, "reward_std": 0.08779045939445496, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4726688861846924, "rewards/correct_reward_func/std": 0.10579022765159607, "step": 1066 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2828.0, "completions/max_terminated_length": 2828.0, "completions/mean_length": 1608.5357666015625, "completions/mean_terminated_length": 1608.5357666015625, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 1.661993769470405, "grad_norm": 0.5866445302963257, "kl": 0.05647643841803074, "learning_rate": 1.34e-06, "loss": -0.0263, "num_tokens": 139229187.0, "reward": 1.496279239654541, "reward_std": 0.04081052541732788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49627912044525146, "rewards/correct_reward_func/std": 0.17185096442699432, "step": 1067 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1565.0833740234375, "completions/mean_terminated_length": 1565.0833740234375, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 1.6635514018691588, "grad_norm": 0.5948162078857422, "kl": 0.0556297842413187, "learning_rate": 1.339375e-06, "loss": 0.019, "num_tokens": 139366468.0, "reward": 1.535805106163025, "reward_std": 0.06959780305624008, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5358052253723145, "rewards/correct_reward_func/std": 0.1515055149793625, "step": 1068 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1580.702392578125, "completions/mean_terminated_length": 1580.702392578125, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 1.6651090342679127, "grad_norm": 0.5873362421989441, "kl": 0.05339038372039795, "learning_rate": 1.33875e-06, "loss": -0.005, "num_tokens": 139505169.0, "reward": 1.5126603841781616, "reward_std": 0.053398918360471725, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5126603245735168, "rewards/correct_reward_func/std": 0.13039356470108032, "step": 1069 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 1588.261962890625, "completions/mean_terminated_length": 1588.261962890625, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 1.6666666666666665, "grad_norm": 0.5980243682861328, "kl": 0.05522442050278187, "learning_rate": 1.338125e-06, "loss": -0.0057, "num_tokens": 139644697.0, "reward": 1.5047221183776855, "reward_std": 0.06719934195280075, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5047219395637512, "rewards/correct_reward_func/std": 0.14364303648471832, "step": 1070 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2763.0, "completions/mean_length": 1621.5238037109375, "completions/mean_terminated_length": 1542.361328125, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 1.6682242990654206, "grad_norm": 0.5721915364265442, "kl": 0.05282310023903847, "learning_rate": 1.3375e-06, "loss": 0.0564, "num_tokens": 139786767.0, "reward": 1.4976085424423218, "reward_std": 0.05854801833629608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4976084232330322, "rewards/correct_reward_func/std": 0.20537886023521423, "step": 1071 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2815.0, "completions/max_terminated_length": 2815.0, "completions/mean_length": 1630.09521484375, "completions/mean_terminated_length": 1630.09521484375, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 1.6697819314641744, "grad_norm": 0.5864342451095581, "kl": 0.05530725233256817, "learning_rate": 1.336875e-06, "loss": 0.0279, "num_tokens": 139929623.0, "reward": 1.5195698738098145, "reward_std": 0.08656013011932373, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5314745306968689, "rewards/correct_reward_func/std": 0.1751791387796402, "step": 1072 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2534.0, "completions/max_terminated_length": 2534.0, "completions/mean_length": 1622.7738037109375, "completions/mean_terminated_length": 1622.7738037109375, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 1.6713395638629285, "grad_norm": 0.5774338245391846, "kl": 0.05697022005915642, "learning_rate": 1.33625e-06, "loss": -0.0225, "num_tokens": 140072014.0, "reward": 1.532922625541687, "reward_std": 0.07154227048158646, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.532922625541687, "rewards/correct_reward_func/std": 0.12217090278863907, "step": 1073 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 1480.1309814453125, "completions/mean_terminated_length": 1480.1309814453125, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 1.6728971962616823, "grad_norm": 0.6423328518867493, "kl": 0.055289650335907936, "learning_rate": 1.335625e-06, "loss": -0.0297, "num_tokens": 140202183.0, "reward": 1.5034701824188232, "reward_std": 0.06442193686962128, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5034701228141785, "rewards/correct_reward_func/std": 0.18402770161628723, "step": 1074 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1535.0238037109375, "completions/mean_terminated_length": 1535.0238037109375, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 1.6744548286604362, "grad_norm": 0.5982204675674438, "kl": 0.05604429915547371, "learning_rate": 1.335e-06, "loss": 0.0109, "num_tokens": 140337095.0, "reward": 1.482134222984314, "reward_std": 0.10136926919221878, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5059436559677124, "rewards/correct_reward_func/std": 0.15971846878528595, "step": 1075 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 1592.952392578125, "completions/mean_terminated_length": 1592.952392578125, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 1.67601246105919, "grad_norm": 0.5815662741661072, "kl": 0.054308902472257614, "learning_rate": 1.334375e-06, "loss": 0.0098, "num_tokens": 140476735.0, "reward": 1.5599098205566406, "reward_std": 0.07738285511732101, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5718144178390503, "rewards/correct_reward_func/std": 0.19236153364181519, "step": 1076 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2782.0, "completions/max_terminated_length": 2782.0, "completions/mean_length": 1569.3095703125, "completions/mean_terminated_length": 1569.3095703125, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 1.6775700934579438, "grad_norm": 0.5611425638198853, "kl": 0.05592833273112774, "learning_rate": 1.33375e-06, "loss": 0.0402, "num_tokens": 140614509.0, "reward": 1.5260337591171265, "reward_std": 0.101032555103302, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5498432517051697, "rewards/correct_reward_func/std": 0.20826658606529236, "step": 1077 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 1562.5357666015625, "completions/mean_terminated_length": 1562.5357666015625, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 1.6791277258566977, "grad_norm": 0.5770960450172424, "kl": 0.05291999317705631, "learning_rate": 1.3331249999999998e-06, "loss": 0.0167, "num_tokens": 140751762.0, "reward": 1.5483450889587402, "reward_std": 0.05665075406432152, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5483450293540955, "rewards/correct_reward_func/std": 0.18875320255756378, "step": 1078 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3456.0, "completions/max_terminated_length": 3456.0, "completions/mean_length": 1638.6429443359375, "completions/mean_terminated_length": 1638.6429443359375, "completions/min_length": 1068.0, "completions/min_terminated_length": 1068.0, "epoch": 1.6806853582554517, "grad_norm": 0.5381429195404053, "kl": 0.056252043694257736, "learning_rate": 1.3325e-06, "loss": -0.004, "num_tokens": 140895390.0, "reward": 1.4849931001663208, "reward_std": 0.0644674226641655, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4849930703639984, "rewards/correct_reward_func/std": 0.16995525360107422, "step": 1079 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1534.011962890625, "completions/mean_terminated_length": 1534.011962890625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 1.6822429906542056, "grad_norm": 0.6199350953102112, "kl": 0.054801203310489655, "learning_rate": 1.3318749999999998e-06, "loss": -0.0224, "num_tokens": 141030067.0, "reward": 1.5003803968429565, "reward_std": 0.06060957908630371, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5003802180290222, "rewards/correct_reward_func/std": 0.13026180863380432, "step": 1080 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 1533.4285888671875, "completions/mean_terminated_length": 1533.4285888671875, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 1.6838006230529596, "grad_norm": 0.6023419499397278, "kl": 0.05653490498661995, "learning_rate": 1.33125e-06, "loss": -0.0007, "num_tokens": 141164827.0, "reward": 1.4618875980377197, "reward_std": 0.08731655776500702, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47379231452941895, "rewards/correct_reward_func/std": 0.17352761328220367, "step": 1081 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1517.5, "completions/mean_terminated_length": 1517.5, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 1.6853582554517135, "grad_norm": 0.5876558423042297, "kl": 0.055033523589372635, "learning_rate": 1.3306249999999999e-06, "loss": -0.0173, "num_tokens": 141298423.0, "reward": 1.480236291885376, "reward_std": 0.06154443696141243, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48023614287376404, "rewards/correct_reward_func/std": 0.1546950489282608, "step": 1082 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 1526.2738037109375, "completions/mean_terminated_length": 1526.2738037109375, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 1.6869158878504673, "grad_norm": 0.6123580932617188, "kl": 0.05496268346905708, "learning_rate": 1.33e-06, "loss": 0.0244, "num_tokens": 141432600.0, "reward": 1.4757575988769531, "reward_std": 0.056909676641225815, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4757574498653412, "rewards/correct_reward_func/std": 0.14030733704566956, "step": 1083 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1451.34521484375, "completions/mean_terminated_length": 1451.34521484375, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 1.6884735202492211, "grad_norm": 0.5988257527351379, "kl": 0.05425653047859669, "learning_rate": 1.3293749999999999e-06, "loss": -0.0155, "num_tokens": 141560441.0, "reward": 1.5010333061218262, "reward_std": 0.0469030924141407, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5010332465171814, "rewards/correct_reward_func/std": 0.1944483369588852, "step": 1084 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 1449.6429443359375, "completions/mean_terminated_length": 1449.6429443359375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 1.690031152647975, "grad_norm": 0.606391966342926, "kl": 0.05457920953631401, "learning_rate": 1.32875e-06, "loss": -0.0084, "num_tokens": 141688013.0, "reward": 1.5308442115783691, "reward_std": 0.057608503848314285, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5308440923690796, "rewards/correct_reward_func/std": 0.15508195757865906, "step": 1085 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 1517.3214111328125, "completions/mean_terminated_length": 1517.3214111328125, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "epoch": 1.6915887850467288, "grad_norm": 0.5719998478889465, "kl": 0.0545782595872879, "learning_rate": 1.3281249999999999e-06, "loss": 0.0014, "num_tokens": 141821450.0, "reward": 1.525020956993103, "reward_std": 0.06384439021348953, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5250208973884583, "rewards/correct_reward_func/std": 0.18837735056877136, "step": 1086 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2825.0, "completions/max_terminated_length": 2825.0, "completions/mean_length": 1563.3690185546875, "completions/mean_terminated_length": 1563.3690185546875, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "epoch": 1.6931464174454829, "grad_norm": 0.5464904308319092, "kl": 0.05209500156342983, "learning_rate": 1.3274999999999998e-06, "loss": 0.0005, "num_tokens": 141959049.0, "reward": 1.506603479385376, "reward_std": 0.05781431496143341, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5066033601760864, "rewards/correct_reward_func/std": 0.16205404698848724, "step": 1087 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2334.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 1468.71435546875, "completions/mean_terminated_length": 1468.71435546875, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 1.6947040498442367, "grad_norm": 0.6574540734291077, "kl": 0.056795092299580574, "learning_rate": 1.326875e-06, "loss": 0.0118, "num_tokens": 142088277.0, "reward": 1.4825557470321655, "reward_std": 0.08934339135885239, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49446043372154236, "rewards/correct_reward_func/std": 0.15667720139026642, "step": 1088 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 1535.6905517578125, "completions/mean_terminated_length": 1535.6905517578125, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 1.6962616822429908, "grad_norm": 0.5786517858505249, "kl": 0.055348386988043785, "learning_rate": 1.3262499999999998e-06, "loss": -0.011, "num_tokens": 142223215.0, "reward": 1.5235230922698975, "reward_std": 0.06262627989053726, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5235230326652527, "rewards/correct_reward_func/std": 0.18882040679454803, "step": 1089 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2230.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 1405.607177734375, "completions/mean_terminated_length": 1405.607177734375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 1.6978193146417446, "grad_norm": 0.6453804969787598, "kl": 0.054967125877738, "learning_rate": 1.325625e-06, "loss": -0.0398, "num_tokens": 142347166.0, "reward": 1.4519461393356323, "reward_std": 0.09238488227128983, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46385079622268677, "rewards/correct_reward_func/std": 0.11289027333259583, "step": 1090 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 1491.4881591796875, "completions/mean_terminated_length": 1491.4881591796875, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 1.6993769470404985, "grad_norm": 0.5896411538124084, "kl": 0.05317670479416847, "learning_rate": 1.3249999999999998e-06, "loss": 0.0146, "num_tokens": 142478487.0, "reward": 1.5271276235580444, "reward_std": 0.087456613779068, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5509369969367981, "rewards/correct_reward_func/std": 0.18444468080997467, "step": 1091 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1400.511962890625, "completions/mean_terminated_length": 1400.511962890625, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 1.7009345794392523, "grad_norm": 0.6094242930412292, "kl": 0.05540328659117222, "learning_rate": 1.324375e-06, "loss": 0.0018, "num_tokens": 142602046.0, "reward": 1.5512709617614746, "reward_std": 0.060811061412096024, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5512707829475403, "rewards/correct_reward_func/std": 0.17814837396144867, "step": 1092 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2671.0, "completions/mean_length": 1549.2857666015625, "completions/mean_terminated_length": 1469.2529296875, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 1.7024922118380061, "grad_norm": 0.5704352259635925, "kl": 0.05448896810412407, "learning_rate": 1.3237499999999998e-06, "loss": 0.0621, "num_tokens": 142738288.0, "reward": 1.41412353515625, "reward_std": 0.08829192072153091, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42602819204330444, "rewards/correct_reward_func/std": 0.1305420696735382, "step": 1093 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2229.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1511.607177734375, "completions/mean_terminated_length": 1511.607177734375, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 1.70404984423676, "grad_norm": 0.5821900367736816, "kl": 0.055560242384672165, "learning_rate": 1.3231250000000001e-06, "loss": 0.013, "num_tokens": 142871647.0, "reward": 1.5410221815109253, "reward_std": 0.04229526221752167, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.541022002696991, "rewards/correct_reward_func/std": 0.18999060988426208, "step": 1094 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1469.6905517578125, "completions/mean_terminated_length": 1469.6905517578125, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "epoch": 1.705607476635514, "grad_norm": 0.6663801074028015, "kl": 0.0542462132871151, "learning_rate": 1.3225e-06, "loss": 0.0059, "num_tokens": 143000987.0, "reward": 1.4598166942596436, "reward_std": 0.08724696189165115, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47172147035598755, "rewards/correct_reward_func/std": 0.12283144146203995, "step": 1095 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 1544.2738037109375, "completions/mean_terminated_length": 1464.1806640625, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 1.7071651090342679, "grad_norm": 0.6034864187240601, "kl": 0.05409064143896103, "learning_rate": 1.321875e-06, "loss": 0.0436, "num_tokens": 143136730.0, "reward": 1.4459140300750732, "reward_std": 0.062484294176101685, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4459139406681061, "rewards/correct_reward_func/std": 0.11880803108215332, "step": 1096 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1475.357177734375, "completions/mean_terminated_length": 1475.357177734375, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 1.708722741433022, "grad_norm": 0.6146649122238159, "kl": 0.0549653135240078, "learning_rate": 1.32125e-06, "loss": 0.0078, "num_tokens": 143266876.0, "reward": 1.4944621324539185, "reward_std": 0.04716341942548752, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49446216225624084, "rewards/correct_reward_func/std": 0.11421861499547958, "step": 1097 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1380.392822265625, "completions/mean_terminated_length": 1380.392822265625, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 1.7102803738317758, "grad_norm": 0.6232919692993164, "kl": 0.05559522844851017, "learning_rate": 1.320625e-06, "loss": 0.0163, "num_tokens": 143388619.0, "reward": 1.516537070274353, "reward_std": 0.09244091063737869, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5165370106697083, "rewards/correct_reward_func/std": 0.15545852482318878, "step": 1098 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 1409.1785888671875, "completions/mean_terminated_length": 1409.1785888671875, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 1.7118380062305296, "grad_norm": 0.6016949415206909, "kl": 0.05453530512750149, "learning_rate": 1.32e-06, "loss": 0.0138, "num_tokens": 143512978.0, "reward": 1.5119688510894775, "reward_std": 0.05214161053299904, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.511968731880188, "rewards/correct_reward_func/std": 0.16375254094600677, "step": 1099 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1436.34521484375, "completions/mean_terminated_length": 1354.9517822265625, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "epoch": 1.7133956386292835, "grad_norm": 0.6520361304283142, "kl": 0.054973041638731956, "learning_rate": 1.319375e-06, "loss": 0.047, "num_tokens": 143639583.0, "reward": 1.474164366722107, "reward_std": 0.05344012752175331, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4741641879081726, "rewards/correct_reward_func/std": 0.13808943331241608, "step": 1100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1427.0595703125, "completions/mean_terminated_length": 1427.0595703125, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 1.7149532710280373, "grad_norm": 0.605894148349762, "kl": 0.05449598841369152, "learning_rate": 1.31875e-06, "loss": -0.0016, "num_tokens": 143765408.0, "reward": 1.5508211851119995, "reward_std": 0.06333474069833755, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.55082106590271, "rewards/correct_reward_func/std": 0.15021193027496338, "step": 1101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2135.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 1426.452392578125, "completions/mean_terminated_length": 1426.452392578125, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 1.7165109034267911, "grad_norm": 0.6389424800872803, "kl": 0.05553033761680126, "learning_rate": 1.318125e-06, "loss": 0.0191, "num_tokens": 143891242.0, "reward": 1.5053493976593018, "reward_std": 0.05772963538765907, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053492784500122, "rewards/correct_reward_func/std": 0.15680259466171265, "step": 1102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2133.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 1420.9405517578125, "completions/mean_terminated_length": 1420.9405517578125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 1.7180685358255452, "grad_norm": 0.5900627374649048, "kl": 0.059002652764320374, "learning_rate": 1.3174999999999999e-06, "loss": -0.0028, "num_tokens": 144016523.0, "reward": 1.4361919164657593, "reward_std": 0.10630304366350174, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44809669256210327, "rewards/correct_reward_func/std": 0.19144123792648315, "step": 1103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1375.357177734375, "completions/mean_terminated_length": 1375.357177734375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 1.719626168224299, "grad_norm": 0.6294445395469666, "kl": 0.056898972019553185, "learning_rate": 1.316875e-06, "loss": -0.0283, "num_tokens": 144137987.0, "reward": 1.5388362407684326, "reward_std": 0.05523570626974106, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5388362407684326, "rewards/correct_reward_func/std": 0.13087205588817596, "step": 1104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1432.0238037109375, "completions/mean_terminated_length": 1432.0238037109375, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 1.721183800623053, "grad_norm": 0.5791958570480347, "kl": 0.057295188307762146, "learning_rate": 1.3162499999999999e-06, "loss": -0.015, "num_tokens": 144264211.0, "reward": 1.5195956230163574, "reward_std": 0.08771070092916489, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5315002202987671, "rewards/correct_reward_func/std": 0.1867634356021881, "step": 1105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2203.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 1532.047607421875, "completions/mean_terminated_length": 1532.047607421875, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 1.722741433021807, "grad_norm": 0.5984449982643127, "kl": 0.05484708957374096, "learning_rate": 1.315625e-06, "loss": 0.0142, "num_tokens": 144399173.0, "reward": 1.5028140544891357, "reward_std": 0.07174117118120193, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5028138756752014, "rewards/correct_reward_func/std": 0.14959195256233215, "step": 1106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 1408.452392578125, "completions/mean_terminated_length": 1408.452392578125, "completions/min_length": 965.0, "completions/min_terminated_length": 965.0, "epoch": 1.7242990654205608, "grad_norm": 0.6338729858398438, "kl": 0.05605399049818516, "learning_rate": 1.315e-06, "loss": -0.0113, "num_tokens": 144523531.0, "reward": 1.554080605506897, "reward_std": 0.05395637825131416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5540804266929626, "rewards/correct_reward_func/std": 0.14322131872177124, "step": 1107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1479.8690185546875, "completions/mean_terminated_length": 1479.8690185546875, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 1.7258566978193146, "grad_norm": 0.600147545337677, "kl": 0.05932756885886192, "learning_rate": 1.314375e-06, "loss": -0.0367, "num_tokens": 144653846.0, "reward": 1.4748127460479736, "reward_std": 0.06064697727560997, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47481268644332886, "rewards/correct_reward_func/std": 0.19672922790050507, "step": 1108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 1457.011962890625, "completions/mean_terminated_length": 1457.011962890625, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 1.7274143302180685, "grad_norm": 0.6160877346992493, "kl": 0.05862598493695259, "learning_rate": 1.31375e-06, "loss": 0.0016, "num_tokens": 144782175.0, "reward": 1.460358738899231, "reward_std": 0.07062016427516937, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4603586792945862, "rewards/correct_reward_func/std": 0.16299869120121002, "step": 1109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1407.4761962890625, "completions/mean_terminated_length": 1407.4761962890625, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 1.7289719626168223, "grad_norm": 0.6253482699394226, "kl": 0.055128734558820724, "learning_rate": 1.313125e-06, "loss": 0.0057, "num_tokens": 144906169.0, "reward": 1.5652191638946533, "reward_std": 0.04696870967745781, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5652191638946533, "rewards/correct_reward_func/std": 0.14143693447113037, "step": 1110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1424.511962890625, "completions/mean_terminated_length": 1424.511962890625, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "epoch": 1.7305295950155763, "grad_norm": 0.6093751788139343, "kl": 0.05493195168673992, "learning_rate": 1.3125e-06, "loss": -0.0023, "num_tokens": 145031690.0, "reward": 1.4864121675491333, "reward_std": 0.05021434277296066, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4864121377468109, "rewards/correct_reward_func/std": 0.14364133775234222, "step": 1111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2182.0, "completions/max_terminated_length": 2182.0, "completions/mean_length": 1446.047607421875, "completions/mean_terminated_length": 1446.047607421875, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 1.7320872274143302, "grad_norm": 0.5680668950080872, "kl": 0.055835017934441566, "learning_rate": 1.3118749999999998e-06, "loss": -0.0037, "num_tokens": 145159110.0, "reward": 1.4660099744796753, "reward_std": 0.0987553820014, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4898194670677185, "rewards/correct_reward_func/std": 0.19845271110534668, "step": 1112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 1497.09521484375, "completions/mean_terminated_length": 1497.09521484375, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 1.7336448598130842, "grad_norm": 0.5553818345069885, "kl": 0.053073033690452576, "learning_rate": 1.31125e-06, "loss": -0.0156, "num_tokens": 145290962.0, "reward": 1.5105202198028564, "reward_std": 0.09678389132022858, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5224248766899109, "rewards/correct_reward_func/std": 0.15707534551620483, "step": 1113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1541.0714111328125, "completions/mean_terminated_length": 1460.939697265625, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "epoch": 1.735202492211838, "grad_norm": 0.5799816846847534, "kl": 0.053622497245669365, "learning_rate": 1.3106249999999998e-06, "loss": 0.0383, "num_tokens": 145426388.0, "reward": 1.4927005767822266, "reward_std": 0.06569858640432358, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4927005171775818, "rewards/correct_reward_func/std": 0.16057810187339783, "step": 1114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2286.0, "completions/max_terminated_length": 2286.0, "completions/mean_length": 1603.4881591796875, "completions/mean_terminated_length": 1603.4881591796875, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "epoch": 1.736760124610592, "grad_norm": 0.5952714085578918, "kl": 0.06090260297060013, "learning_rate": 1.31e-06, "loss": 0.0038, "num_tokens": 145567369.0, "reward": 1.5022380352020264, "reward_std": 0.05294782295823097, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5022379159927368, "rewards/correct_reward_func/std": 0.1142892986536026, "step": 1115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1599.702392578125, "completions/mean_terminated_length": 1599.702392578125, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "epoch": 1.7383177570093458, "grad_norm": 0.5825952291488647, "kl": 0.05465948395431042, "learning_rate": 1.3093749999999999e-06, "loss": 0.0218, "num_tokens": 145707672.0, "reward": 1.495916485786438, "reward_std": 0.09848009049892426, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5078211426734924, "rewards/correct_reward_func/std": 0.1747811734676361, "step": 1116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1491.797607421875, "completions/mean_terminated_length": 1491.797607421875, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 1.7398753894080996, "grad_norm": 0.6028727889060974, "kl": 0.05651978775858879, "learning_rate": 1.30875e-06, "loss": 0.0046, "num_tokens": 145838881.0, "reward": 1.5291047096252441, "reward_std": 0.06624387949705124, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5291045904159546, "rewards/correct_reward_func/std": 0.1508786380290985, "step": 1117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 1547.8809814453125, "completions/mean_terminated_length": 1547.8809814453125, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 1.7414330218068534, "grad_norm": 0.6244828701019287, "kl": 0.0559681486338377, "learning_rate": 1.3081249999999999e-06, "loss": 0.0162, "num_tokens": 145974867.0, "reward": 1.3941341638565063, "reward_std": 0.11291488260030746, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.41794368624687195, "rewards/correct_reward_func/std": 0.12424509972333908, "step": 1118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 1546.59521484375, "completions/mean_terminated_length": 1546.59521484375, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 1.7429906542056075, "grad_norm": 0.56931471824646, "kl": 0.05700571648776531, "learning_rate": 1.3075e-06, "loss": 0.0031, "num_tokens": 146110703.0, "reward": 1.4941818714141846, "reward_std": 0.0815611258149147, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.506086528301239, "rewards/correct_reward_func/std": 0.16775766015052795, "step": 1119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1628.0, "completions/mean_terminated_length": 1628.0, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 1.7445482866043613, "grad_norm": 0.5838506817817688, "kl": 0.055558497086167336, "learning_rate": 1.3068749999999999e-06, "loss": 0.0079, "num_tokens": 146253557.0, "reward": 1.496366024017334, "reward_std": 0.0907699316740036, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5082707405090332, "rewards/correct_reward_func/std": 0.14823462069034576, "step": 1120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 1579.5833740234375, "completions/mean_terminated_length": 1579.5833740234375, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "epoch": 1.7461059190031154, "grad_norm": 0.6281409859657288, "kl": 0.05499861761927605, "learning_rate": 1.3062499999999998e-06, "loss": -0.0144, "num_tokens": 146392134.0, "reward": 1.4551624059677124, "reward_std": 0.07651150971651077, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4670672118663788, "rewards/correct_reward_func/std": 0.1533525586128235, "step": 1121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2883.0, "completions/max_terminated_length": 2883.0, "completions/mean_length": 1576.3809814453125, "completions/mean_terminated_length": 1576.3809814453125, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 1.7476635514018692, "grad_norm": 0.6155412793159485, "kl": 0.056425297632813454, "learning_rate": 1.3056249999999999e-06, "loss": 0.009, "num_tokens": 146530592.0, "reward": 1.5117181539535522, "reward_std": 0.063330739736557, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5117180943489075, "rewards/correct_reward_func/std": 0.13607822358608246, "step": 1122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2427.0, "completions/max_terminated_length": 2427.0, "completions/mean_length": 1599.59521484375, "completions/mean_terminated_length": 1599.59521484375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 1.749221183800623, "grad_norm": 0.5919455289840698, "kl": 0.0546236839145422, "learning_rate": 1.3049999999999998e-06, "loss": 0.033, "num_tokens": 146671006.0, "reward": 1.4846004247665405, "reward_std": 0.05139493942260742, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4846002459526062, "rewards/correct_reward_func/std": 0.1640039086341858, "step": 1123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1594.3214111328125, "completions/mean_terminated_length": 1594.3214111328125, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "epoch": 1.750778816199377, "grad_norm": 0.5721678733825684, "kl": 0.05400045961141586, "learning_rate": 1.304375e-06, "loss": -0.0068, "num_tokens": 146810857.0, "reward": 1.4846551418304443, "reward_std": 0.07246145606040955, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4846550524234772, "rewards/correct_reward_func/std": 0.1400829702615738, "step": 1124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2776.0, "completions/max_terminated_length": 2776.0, "completions/mean_length": 1637.7857666015625, "completions/mean_terminated_length": 1637.7857666015625, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 1.7523364485981308, "grad_norm": 1.4375020265579224, "kl": 0.0829340610653162, "learning_rate": 1.30375e-06, "loss": 0.0117, "num_tokens": 146954533.0, "reward": 1.5203410387039185, "reward_std": 0.08337219059467316, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5322458148002625, "rewards/correct_reward_func/std": 0.12477599829435349, "step": 1125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3025.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 1630.202392578125, "completions/mean_terminated_length": 1630.202392578125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 1.7538940809968846, "grad_norm": 0.5690008997917175, "kl": 0.05563694052398205, "learning_rate": 1.3031250000000001e-06, "loss": 0.0385, "num_tokens": 147097392.0, "reward": 1.4745368957519531, "reward_std": 0.10618807375431061, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4864416718482971, "rewards/correct_reward_func/std": 0.15106239914894104, "step": 1126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3197.0, "completions/max_terminated_length": 3197.0, "completions/mean_length": 1640.75, "completions/mean_terminated_length": 1640.75, "completions/min_length": 1086.0, "completions/min_terminated_length": 1086.0, "epoch": 1.7554517133956387, "grad_norm": 0.5705302357673645, "kl": 0.055333059281110764, "learning_rate": 1.3025e-06, "loss": 0.0053, "num_tokens": 147241203.0, "reward": 1.4788492918014526, "reward_std": 0.06541267782449722, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4788493514060974, "rewards/correct_reward_func/std": 0.15010850131511688, "step": 1127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1585.1429443359375, "completions/mean_terminated_length": 1585.1429443359375, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "epoch": 1.7570093457943925, "grad_norm": 0.5711000561714172, "kl": 0.05572010576725006, "learning_rate": 1.301875e-06, "loss": 0.0208, "num_tokens": 147380397.0, "reward": 1.4928436279296875, "reward_std": 0.14161552488803864, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.5285578966140747, "rewards/correct_reward_func/std": 0.1611003428697586, "step": 1128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3828.0, "completions/max_terminated_length": 3828.0, "completions/mean_length": 1599.3690185546875, "completions/mean_terminated_length": 1599.3690185546875, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "epoch": 1.7585669781931466, "grad_norm": 0.5605370402336121, "kl": 0.055966269224882126, "learning_rate": 1.30125e-06, "loss": -0.0042, "num_tokens": 147520594.0, "reward": 1.5011796951293945, "reward_std": 0.07264749705791473, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5011795163154602, "rewards/correct_reward_func/std": 0.14335981011390686, "step": 1129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 1747.25, "completions/mean_terminated_length": 1669.602294921875, "completions/min_length": 1133.0, "completions/min_terminated_length": 1133.0, "epoch": 1.7601246105919004, "grad_norm": 0.52415931224823, "kl": 0.05284489691257477, "learning_rate": 1.300625e-06, "loss": 0.0758, "num_tokens": 147673579.0, "reward": 1.4761098623275757, "reward_std": 0.11628463119268417, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48801448941230774, "rewards/correct_reward_func/std": 0.1610371619462967, "step": 1130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2286.0, "completions/max_terminated_length": 2286.0, "completions/mean_length": 1601.5357666015625, "completions/mean_terminated_length": 1601.5357666015625, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 1.7616822429906542, "grad_norm": 0.5671694278717041, "kl": 0.058677833527326584, "learning_rate": 1.3e-06, "loss": -0.028, "num_tokens": 147813910.0, "reward": 1.5925577878952026, "reward_std": 0.09395749121904373, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5925577878952026, "rewards/correct_reward_func/std": 0.20880362391471863, "step": 1131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2860.0, "completions/max_terminated_length": 2860.0, "completions/mean_length": 1539.047607421875, "completions/mean_terminated_length": 1539.047607421875, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "epoch": 1.763239875389408, "grad_norm": 0.6003912091255188, "kl": 0.05654078163206577, "learning_rate": 1.299375e-06, "loss": 0.0189, "num_tokens": 147949088.0, "reward": 1.5030018091201782, "reward_std": 0.06753882765769958, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5030016899108887, "rewards/correct_reward_func/std": 0.16837167739868164, "step": 1132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2639.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 1571.202392578125, "completions/mean_terminated_length": 1571.202392578125, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 1.764797507788162, "grad_norm": 0.5741766095161438, "kl": 0.05285065062344074, "learning_rate": 1.29875e-06, "loss": -0.0218, "num_tokens": 148087279.0, "reward": 1.4853522777557373, "reward_std": 0.06807680428028107, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4853522777557373, "rewards/correct_reward_func/std": 0.16334594786167145, "step": 1133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 1654.8929443359375, "completions/mean_terminated_length": 1654.8929443359375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "epoch": 1.7663551401869158, "grad_norm": 0.5527448654174805, "kl": 0.05648932047188282, "learning_rate": 1.298125e-06, "loss": -0.0162, "num_tokens": 148232224.0, "reward": 1.4668110609054565, "reward_std": 0.07707585394382477, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4787157475948334, "rewards/correct_reward_func/std": 0.13740520179271698, "step": 1134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 1582.9405517578125, "completions/mean_terminated_length": 1582.9405517578125, "completions/min_length": 1104.0, "completions/min_terminated_length": 1104.0, "epoch": 1.7679127725856698, "grad_norm": 0.6102333664894104, "kl": 0.05320485308766365, "learning_rate": 1.2975e-06, "loss": 0.0112, "num_tokens": 148371059.0, "reward": 1.4848796129226685, "reward_std": 0.09365906566381454, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4967842698097229, "rewards/correct_reward_func/std": 0.1526029109954834, "step": 1135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2439.0, "completions/max_terminated_length": 2439.0, "completions/mean_length": 1627.8690185546875, "completions/mean_terminated_length": 1627.8690185546875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 1.7694704049844237, "grad_norm": 0.5520922541618347, "kl": 0.05490568093955517, "learning_rate": 1.296875e-06, "loss": -0.013, "num_tokens": 148513674.0, "reward": 1.4324344396591187, "reward_std": 0.14724348485469818, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.4681486487388611, "rewards/correct_reward_func/std": 0.15150687098503113, "step": 1136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2219.0, "completions/max_terminated_length": 2219.0, "completions/mean_length": 1579.15478515625, "completions/mean_terminated_length": 1579.15478515625, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 1.7710280373831777, "grad_norm": 0.5579913258552551, "kl": 0.05492355860769749, "learning_rate": 1.2962499999999999e-06, "loss": 0.0176, "num_tokens": 148652287.0, "reward": 1.5031119585037231, "reward_std": 0.07227717339992523, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5031118988990784, "rewards/correct_reward_func/std": 0.1397542804479599, "step": 1137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2666.0, "completions/max_terminated_length": 2666.0, "completions/mean_length": 1558.96435546875, "completions/mean_terminated_length": 1558.96435546875, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 1.7725856697819315, "grad_norm": 0.5811904072761536, "kl": 0.05289284326136112, "learning_rate": 1.295625e-06, "loss": 0.0216, "num_tokens": 148789204.0, "reward": 1.4740662574768066, "reward_std": 0.07208415865898132, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47406622767448425, "rewards/correct_reward_func/std": 0.19461053609848022, "step": 1138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1559.2261962890625, "completions/mean_terminated_length": 1559.2261962890625, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 1.7741433021806854, "grad_norm": 0.5912863612174988, "kl": 0.05293158255517483, "learning_rate": 1.2949999999999999e-06, "loss": 0.0079, "num_tokens": 148926251.0, "reward": 1.5115501880645752, "reward_std": 0.0770222395658493, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5234548449516296, "rewards/correct_reward_func/std": 0.1550011783838272, "step": 1139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1508.84521484375, "completions/mean_terminated_length": 1508.84521484375, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 1.7757009345794392, "grad_norm": 0.6117761731147766, "kl": 0.05457943677902222, "learning_rate": 1.294375e-06, "loss": 0.0173, "num_tokens": 149059018.0, "reward": 1.4866890907287598, "reward_std": 0.05505378171801567, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48668912053108215, "rewards/correct_reward_func/std": 0.1755112260580063, "step": 1140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1585.5357666015625, "completions/mean_terminated_length": 1585.5357666015625, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 1.777258566978193, "grad_norm": 0.6276149749755859, "kl": 0.05582164414227009, "learning_rate": 1.29375e-06, "loss": 0.0168, "num_tokens": 149198179.0, "reward": 1.429378867149353, "reward_std": 0.06529611349105835, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42937877774238586, "rewards/correct_reward_func/std": 0.11664937436580658, "step": 1141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 1539.9761962890625, "completions/mean_terminated_length": 1539.9761962890625, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "epoch": 1.778816199376947, "grad_norm": 0.5955563187599182, "kl": 0.05210274085402489, "learning_rate": 1.293125e-06, "loss": 0.0268, "num_tokens": 149333639.0, "reward": 1.4899613857269287, "reward_std": 0.07460404187440872, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.489961177110672, "rewards/correct_reward_func/std": 0.1342015117406845, "step": 1142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 1504.2857666015625, "completions/mean_terminated_length": 1504.2857666015625, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.780373831775701, "grad_norm": 0.6213486790657043, "kl": 0.05471382103860378, "learning_rate": 1.2925e-06, "loss": -0.0085, "num_tokens": 149465927.0, "reward": 1.5641454458236694, "reward_std": 0.07356414198875427, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5641455054283142, "rewards/correct_reward_func/std": 0.13944335281848907, "step": 1143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 1488.3333740234375, "completions/mean_terminated_length": 1488.3333740234375, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 1.7819314641744548, "grad_norm": 0.5873425602912903, "kl": 0.054566334933042526, "learning_rate": 1.291875e-06, "loss": -0.0007, "num_tokens": 149596923.0, "reward": 1.5882827043533325, "reward_std": 0.08624503761529922, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5882828235626221, "rewards/correct_reward_func/std": 0.14811742305755615, "step": 1144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2588.0, "completions/mean_length": 1580.0714111328125, "completions/mean_terminated_length": 1500.4095458984375, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 1.7834890965732089, "grad_norm": 0.5539507865905762, "kl": 0.05389244481921196, "learning_rate": 1.29125e-06, "loss": 0.0546, "num_tokens": 149735451.0, "reward": 1.4518744945526123, "reward_std": 0.07488683611154556, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.451874315738678, "rewards/correct_reward_func/std": 0.16610823571681976, "step": 1145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2628.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 1472.7381591796875, "completions/mean_terminated_length": 1472.7381591796875, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 1.7850467289719627, "grad_norm": 0.6031122207641602, "kl": 0.05361813306808472, "learning_rate": 1.2906249999999998e-06, "loss": 0.0257, "num_tokens": 149865137.0, "reward": 1.4697390794754028, "reward_std": 0.05983883515000343, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46973901987075806, "rewards/correct_reward_func/std": 0.14120249450206757, "step": 1146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 1473.3333740234375, "completions/mean_terminated_length": 1473.3333740234375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 1.7866043613707165, "grad_norm": 0.5901123285293579, "kl": 0.055695297196507454, "learning_rate": 1.29e-06, "loss": 0.0496, "num_tokens": 149994807.0, "reward": 1.4576581716537476, "reward_std": 0.07900339365005493, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.457658052444458, "rewards/correct_reward_func/std": 0.15341699123382568, "step": 1147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2234.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 1536.59521484375, "completions/mean_terminated_length": 1536.59521484375, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 1.7881619937694704, "grad_norm": 0.6163697838783264, "kl": 0.054377732798457146, "learning_rate": 1.2893749999999998e-06, "loss": -0.0007, "num_tokens": 150129911.0, "reward": 1.4904532432556152, "reward_std": 0.06309396773576736, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49045321345329285, "rewards/correct_reward_func/std": 0.19417829811573029, "step": 1148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1408.1309814453125, "completions/mean_terminated_length": 1408.1309814453125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 1.7897196261682242, "grad_norm": 0.6073015332221985, "kl": 0.057983702048659325, "learning_rate": 1.28875e-06, "loss": 0.002, "num_tokens": 150253948.0, "reward": 1.522030234336853, "reward_std": 0.09770264476537704, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5339348912239075, "rewards/correct_reward_func/std": 0.14656081795692444, "step": 1149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2171.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 1421.511962890625, "completions/mean_terminated_length": 1421.511962890625, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 1.791277258566978, "grad_norm": 0.6110856533050537, "kl": 0.05523865297436714, "learning_rate": 1.2881249999999998e-06, "loss": 0.0303, "num_tokens": 150379247.0, "reward": 1.4607127904891968, "reward_std": 0.07341668009757996, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4607127010822296, "rewards/correct_reward_func/std": 0.14644268155097961, "step": 1150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 1451.1785888671875, "completions/mean_terminated_length": 1451.1785888671875, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 1.7928348909657321, "grad_norm": 0.6167957782745361, "kl": 0.05495339445769787, "learning_rate": 1.2875e-06, "loss": 0.0227, "num_tokens": 150507230.0, "reward": 1.5653648376464844, "reward_std": 0.06506835669279099, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.56536465883255, "rewards/correct_reward_func/std": 0.16461879014968872, "step": 1151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 1453.5714111328125, "completions/mean_terminated_length": 1453.5714111328125, "completions/min_length": 1014.0, "completions/min_terminated_length": 1014.0, "epoch": 1.794392523364486, "grad_norm": 0.6006103754043579, "kl": 0.05619592405855656, "learning_rate": 1.2868749999999999e-06, "loss": 0.004, "num_tokens": 150635276.0, "reward": 1.451198935508728, "reward_std": 0.06814772635698318, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45119890570640564, "rewards/correct_reward_func/std": 0.116157166659832, "step": 1152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2186.0, "completions/max_terminated_length": 2186.0, "completions/mean_length": 1429.7261962890625, "completions/mean_terminated_length": 1429.7261962890625, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 1.79595015576324, "grad_norm": 0.6754088401794434, "kl": 0.05639287270605564, "learning_rate": 1.2862499999999998e-06, "loss": 0.0207, "num_tokens": 150761409.0, "reward": 1.4789947271347046, "reward_std": 0.07913509011268616, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4908995032310486, "rewards/correct_reward_func/std": 0.12479308992624283, "step": 1153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2212.0, "completions/max_terminated_length": 2212.0, "completions/mean_length": 1413.6905517578125, "completions/mean_terminated_length": 1413.6905517578125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 1.7975077881619939, "grad_norm": 0.6476159691810608, "kl": 0.057286160066723824, "learning_rate": 1.2856249999999999e-06, "loss": 0.0176, "num_tokens": 150886009.0, "reward": 1.5201369524002075, "reward_std": 0.11201467365026474, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5320415496826172, "rewards/correct_reward_func/std": 0.13601742684841156, "step": 1154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1416.4405517578125, "completions/mean_terminated_length": 1416.4405517578125, "completions/min_length": 941.0, "completions/min_terminated_length": 941.0, "epoch": 1.7990654205607477, "grad_norm": 0.6427170038223267, "kl": 0.055769408121705055, "learning_rate": 1.2849999999999998e-06, "loss": 0.0287, "num_tokens": 151010870.0, "reward": 1.475823998451233, "reward_std": 0.06063052639365196, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47582390904426575, "rewards/correct_reward_func/std": 0.14082695543766022, "step": 1155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1376.797607421875, "completions/mean_terminated_length": 1376.797607421875, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 1.8006230529595015, "grad_norm": 0.651973307132721, "kl": 0.054482584819197655, "learning_rate": 1.284375e-06, "loss": 0.0207, "num_tokens": 151132359.0, "reward": 1.3592936992645264, "reward_std": 0.049571361392736435, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.359293669462204, "rewards/correct_reward_func/std": 0.09493573009967804, "step": 1156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1380.9881591796875, "completions/mean_terminated_length": 1380.9881591796875, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 1.8021806853582554, "grad_norm": 0.5955179929733276, "kl": 0.05535360611975193, "learning_rate": 1.28375e-06, "loss": 0.0307, "num_tokens": 151254284.0, "reward": 1.5542653799057007, "reward_std": 0.05647812783718109, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5542653203010559, "rewards/correct_reward_func/std": 0.13479846715927124, "step": 1157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 1367.7857666015625, "completions/mean_terminated_length": 1367.7857666015625, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 1.8037383177570092, "grad_norm": 0.6157006621360779, "kl": 0.05585562810301781, "learning_rate": 1.283125e-06, "loss": 0.0104, "num_tokens": 151374968.0, "reward": 1.4527347087860107, "reward_std": 0.04634414240717888, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4527346193790436, "rewards/correct_reward_func/std": 0.14518789947032928, "step": 1158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1361.25, "completions/mean_terminated_length": 1361.25, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 1.8052959501557633, "grad_norm": 0.6035506129264832, "kl": 0.05575713329017162, "learning_rate": 1.2825e-06, "loss": 0.0043, "num_tokens": 151495205.0, "reward": 1.5081394910812378, "reward_std": 0.09966874122619629, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.520044207572937, "rewards/correct_reward_func/std": 0.1669865995645523, "step": 1159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2446.0, "completions/max_terminated_length": 2446.0, "completions/mean_length": 1383.40478515625, "completions/mean_terminated_length": 1383.40478515625, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 1.8068535825545171, "grad_norm": 0.5899766683578491, "kl": 0.055932315066456795, "learning_rate": 1.2818750000000001e-06, "loss": -0.0176, "num_tokens": 151617435.0, "reward": 1.5422968864440918, "reward_std": 0.07693206518888474, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5422968864440918, "rewards/correct_reward_func/std": 0.18273551762104034, "step": 1160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1379.952392578125, "completions/mean_terminated_length": 1379.952392578125, "completions/min_length": 941.0, "completions/min_terminated_length": 941.0, "epoch": 1.8084112149532712, "grad_norm": 0.6354824900627136, "kl": 0.05722059868276119, "learning_rate": 1.28125e-06, "loss": -0.0138, "num_tokens": 151739441.0, "reward": 1.47968327999115, "reward_std": 0.06953909248113632, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47968319058418274, "rewards/correct_reward_func/std": 0.13530801236629486, "step": 1161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3506.0, "completions/max_terminated_length": 3506.0, "completions/mean_length": 1438.202392578125, "completions/mean_terminated_length": 1438.202392578125, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 1.809968847352025, "grad_norm": 0.618902862071991, "kl": 0.05636182054877281, "learning_rate": 1.280625e-06, "loss": -0.0052, "num_tokens": 151866064.0, "reward": 1.4654076099395752, "reward_std": 0.05225364491343498, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46540752053260803, "rewards/correct_reward_func/std": 0.16426697373390198, "step": 1162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1380.84521484375, "completions/mean_terminated_length": 1380.84521484375, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 1.8115264797507789, "grad_norm": 0.6325119137763977, "kl": 0.057295599952340126, "learning_rate": 1.28e-06, "loss": 0.0063, "num_tokens": 151987893.0, "reward": 1.4614249467849731, "reward_std": 0.08124567568302155, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46142497658729553, "rewards/correct_reward_func/std": 0.15714338421821594, "step": 1163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 1522.8809814453125, "completions/mean_terminated_length": 1442.530029296875, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 1.8130841121495327, "grad_norm": 0.5904710292816162, "kl": 0.0530658233910799, "learning_rate": 1.279375e-06, "loss": 0.0394, "num_tokens": 152121893.0, "reward": 1.4404704570770264, "reward_std": 0.06553060561418533, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44047027826309204, "rewards/correct_reward_func/std": 0.18878044188022614, "step": 1164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2015.0, "completions/max_terminated_length": 2015.0, "completions/mean_length": 1383.3809814453125, "completions/mean_terminated_length": 1383.3809814453125, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 1.8146417445482865, "grad_norm": 0.6558542251586914, "kl": 0.054837485775351524, "learning_rate": 1.27875e-06, "loss": 0.0269, "num_tokens": 152244097.0, "reward": 1.4359396696090698, "reward_std": 0.058611880987882614, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43593963980674744, "rewards/correct_reward_func/std": 0.1572636216878891, "step": 1165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1347.25, "completions/mean_terminated_length": 1347.25, "completions/min_length": 685.0, "completions/min_terminated_length": 685.0, "epoch": 1.8161993769470404, "grad_norm": 0.6130359768867493, "kl": 0.05522622726857662, "learning_rate": 1.278125e-06, "loss": 0.008, "num_tokens": 152363170.0, "reward": 1.4510539770126343, "reward_std": 0.06221471726894379, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4510539770126343, "rewards/correct_reward_func/std": 0.09577855467796326, "step": 1166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1393.142822265625, "completions/mean_terminated_length": 1393.142822265625, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 1.8177570093457944, "grad_norm": 0.6263551115989685, "kl": 0.05463282763957977, "learning_rate": 1.2775e-06, "loss": 0.0061, "num_tokens": 152486248.0, "reward": 1.5984073877334595, "reward_std": 0.08862313628196716, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.6103121042251587, "rewards/correct_reward_func/std": 0.14181381464004517, "step": 1167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 1409.916748046875, "completions/mean_terminated_length": 1409.916748046875, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 1.8193146417445483, "grad_norm": 0.5975865125656128, "kl": 0.06005329266190529, "learning_rate": 1.276875e-06, "loss": 0.0538, "num_tokens": 152610645.0, "reward": 1.3848499059677124, "reward_std": 0.09300456941127777, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3967547118663788, "rewards/correct_reward_func/std": 0.14781832695007324, "step": 1168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1374.297607421875, "completions/mean_terminated_length": 1374.297607421875, "completions/min_length": 711.0, "completions/min_terminated_length": 711.0, "epoch": 1.8208722741433023, "grad_norm": 0.6153602600097656, "kl": 0.05698695592582226, "learning_rate": 1.27625e-06, "loss": -0.0015, "num_tokens": 152732122.0, "reward": 1.4996377229690552, "reward_std": 0.05012006685137749, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4996376931667328, "rewards/correct_reward_func/std": 0.18587073683738708, "step": 1169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2416.0, "completions/max_terminated_length": 2416.0, "completions/mean_length": 1437.65478515625, "completions/mean_terminated_length": 1437.65478515625, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 1.8224299065420562, "grad_norm": 0.6363869309425354, "kl": 0.05927002243697643, "learning_rate": 1.275625e-06, "loss": -0.0087, "num_tokens": 152859029.0, "reward": 1.445292592048645, "reward_std": 0.06113346666097641, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44529247283935547, "rewards/correct_reward_func/std": 0.14021767675876617, "step": 1170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1998.0, "completions/max_terminated_length": 1998.0, "completions/mean_length": 1407.5357666015625, "completions/mean_terminated_length": 1407.5357666015625, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 1.82398753894081, "grad_norm": 0.6012924909591675, "kl": 0.05573999509215355, "learning_rate": 1.2749999999999999e-06, "loss": -0.0327, "num_tokens": 152983232.0, "reward": 1.4919466972351074, "reward_std": 0.0752594918012619, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5038514137268066, "rewards/correct_reward_func/std": 0.14708462357521057, "step": 1171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 1389.0357666015625, "completions/mean_terminated_length": 1389.0357666015625, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 1.8255451713395638, "grad_norm": 0.6216912269592285, "kl": 0.05775889381766319, "learning_rate": 1.274375e-06, "loss": 0.0057, "num_tokens": 153105833.0, "reward": 1.448509931564331, "reward_std": 0.09076011925935745, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4604146182537079, "rewards/correct_reward_func/std": 0.15207520127296448, "step": 1172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1566.0833740234375, "completions/mean_terminated_length": 1486.2529296875, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "epoch": 1.8271028037383177, "grad_norm": 0.6418584585189819, "kl": 0.056966137140989304, "learning_rate": 1.2737499999999999e-06, "loss": 0.0723, "num_tokens": 153243600.0, "reward": 1.4256501197814941, "reward_std": 0.13713152706623077, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.4613642394542694, "rewards/correct_reward_func/std": 0.12592431902885437, "step": 1173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1864.0, "completions/max_terminated_length": 1864.0, "completions/mean_length": 1373.3333740234375, "completions/mean_terminated_length": 1373.3333740234375, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 1.8286604361370715, "grad_norm": 0.6033210158348083, "kl": 0.05552567541599274, "learning_rate": 1.273125e-06, "loss": 0.0017, "num_tokens": 153364864.0, "reward": 1.5564086437225342, "reward_std": 0.06117241829633713, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5564085245132446, "rewards/correct_reward_func/std": 0.25511133670806885, "step": 1174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2167.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1444.011962890625, "completions/mean_terminated_length": 1444.011962890625, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 1.8302180685358256, "grad_norm": 0.6121575832366943, "kl": 0.05822913348674774, "learning_rate": 1.2724999999999999e-06, "loss": -0.0041, "num_tokens": 153492143.0, "reward": 1.4254029989242554, "reward_std": 0.058368951082229614, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4254028797149658, "rewards/correct_reward_func/std": 0.13294129073619843, "step": 1175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2496.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 1406.107177734375, "completions/mean_terminated_length": 1406.107177734375, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 1.8317757009345794, "grad_norm": 0.594601571559906, "kl": 0.06027892231941223, "learning_rate": 1.271875e-06, "loss": 0.0144, "num_tokens": 153616328.0, "reward": 1.501191258430481, "reward_std": 0.0725860446691513, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5130959749221802, "rewards/correct_reward_func/std": 0.19199717044830322, "step": 1176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 1428.40478515625, "completions/mean_terminated_length": 1428.40478515625, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 1.8333333333333335, "grad_norm": 0.7069098949432373, "kl": 0.06594429723918438, "learning_rate": 1.27125e-06, "loss": -0.0065, "num_tokens": 153742326.0, "reward": 1.5042567253112793, "reward_std": 0.06949569284915924, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5042566061019897, "rewards/correct_reward_func/std": 0.18842458724975586, "step": 1177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 1406.2261962890625, "completions/mean_terminated_length": 1406.2261962890625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 1.8348909657320873, "grad_norm": 0.6675245761871338, "kl": 0.06320172734558582, "learning_rate": 1.2706249999999998e-06, "loss": -0.0124, "num_tokens": 153866281.0, "reward": 1.5121536254882812, "reward_std": 0.07399003207683563, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5121535062789917, "rewards/correct_reward_func/std": 0.1562425196170807, "step": 1178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 1445.0595703125, "completions/mean_terminated_length": 1445.0595703125, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 1.8364485981308412, "grad_norm": 0.6060044765472412, "kl": 0.05702594108879566, "learning_rate": 1.27e-06, "loss": -0.0036, "num_tokens": 153993642.0, "reward": 1.462285041809082, "reward_std": 0.11445801705121994, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4860943853855133, "rewards/correct_reward_func/std": 0.17760923504829407, "step": 1179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 1432.84521484375, "completions/mean_terminated_length": 1432.84521484375, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 1.838006230529595, "grad_norm": 0.627662181854248, "kl": 0.054615674540400505, "learning_rate": 1.2693749999999998e-06, "loss": 0.0157, "num_tokens": 154120325.0, "reward": 1.4728143215179443, "reward_std": 0.05926068127155304, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4728142023086548, "rewards/correct_reward_func/std": 0.14550641179084778, "step": 1180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 1527.7261962890625, "completions/mean_terminated_length": 1447.4337158203125, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 1.8395638629283488, "grad_norm": 0.572921633720398, "kl": 0.052376220002770424, "learning_rate": 1.26875e-06, "loss": 0.1122, "num_tokens": 154254690.0, "reward": 1.539319396018982, "reward_std": 0.10069175064563751, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5512241125106812, "rewards/correct_reward_func/std": 0.1705499291419983, "step": 1181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5922.0, "completions/max_terminated_length": 5922.0, "completions/mean_length": 1456.7381591796875, "completions/mean_terminated_length": 1456.7381591796875, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 1.8411214953271027, "grad_norm": 0.6075485348701477, "kl": 0.05358008295297623, "learning_rate": 1.2681249999999998e-06, "loss": 0.058, "num_tokens": 154382966.0, "reward": 1.5374678373336792, "reward_std": 0.07980293035507202, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5374678373336792, "rewards/correct_reward_func/std": 0.21153266727924347, "step": 1182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1510.40478515625, "completions/mean_terminated_length": 1429.903564453125, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 1.8426791277258567, "grad_norm": 0.5662770867347717, "kl": 0.051361970603466034, "learning_rate": 1.2675e-06, "loss": 0.0438, "num_tokens": 154515756.0, "reward": 1.5456972122192383, "reward_std": 0.06920914351940155, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5456972122192383, "rewards/correct_reward_func/std": 0.2079532891511917, "step": 1183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 1420.3690185546875, "completions/mean_terminated_length": 1420.3690185546875, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 1.8442367601246106, "grad_norm": 0.6576765179634094, "kl": 0.05303676053881645, "learning_rate": 1.2668749999999998e-06, "loss": 0.0123, "num_tokens": 154641061.0, "reward": 1.5421717166900635, "reward_std": 0.06544462591409683, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5421715974807739, "rewards/correct_reward_func/std": 0.17146830260753632, "step": 1184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 1409.261962890625, "completions/mean_terminated_length": 1409.261962890625, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 1.8457943925233646, "grad_norm": 0.6730476021766663, "kl": 0.05465093068778515, "learning_rate": 1.26625e-06, "loss": 0.0243, "num_tokens": 154765151.0, "reward": 1.4848014116287231, "reward_std": 0.07637952268123627, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4848015308380127, "rewards/correct_reward_func/std": 0.16643813252449036, "step": 1185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 1448.107177734375, "completions/mean_terminated_length": 1448.107177734375, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 1.8473520249221185, "grad_norm": 0.5894937515258789, "kl": 0.05262916907668114, "learning_rate": 1.2656249999999998e-06, "loss": 0.0098, "num_tokens": 154892708.0, "reward": 1.5663570165634155, "reward_std": 0.07413491606712341, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5663569569587708, "rewards/correct_reward_func/std": 0.13592872023582458, "step": 1186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2464.0, "completions/max_terminated_length": 2464.0, "completions/mean_length": 1479.4405517578125, "completions/mean_terminated_length": 1479.4405517578125, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 1.8489096573208723, "grad_norm": 0.5858403444290161, "kl": 0.0534311905503273, "learning_rate": 1.2649999999999997e-06, "loss": -0.0121, "num_tokens": 155022933.0, "reward": 1.546149492263794, "reward_std": 0.07733223587274551, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.546149492263794, "rewards/correct_reward_func/std": 0.15818926692008972, "step": 1187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 1475.4881591796875, "completions/mean_terminated_length": 1475.4881591796875, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 1.8504672897196262, "grad_norm": 0.5578842759132385, "kl": 0.053570035845041275, "learning_rate": 1.264375e-06, "loss": 0.0119, "num_tokens": 155152874.0, "reward": 1.475271463394165, "reward_std": 0.0868200734257698, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4871760904788971, "rewards/correct_reward_func/std": 0.17083941400051117, "step": 1188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2628.0, "completions/mean_length": 1580.797607421875, "completions/mean_terminated_length": 1501.14453125, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.85202492211838, "grad_norm": 0.5616086721420288, "kl": 0.053032904863357544, "learning_rate": 1.26375e-06, "loss": 0.0594, "num_tokens": 155291889.0, "reward": 1.4989678859710693, "reward_std": 0.09492374211549759, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5108726024627686, "rewards/correct_reward_func/std": 0.15740281343460083, "step": 1189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1429.6309814453125, "completions/mean_terminated_length": 1429.6309814453125, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 1.8535825545171338, "grad_norm": 0.5990777015686035, "kl": 0.055983515456318855, "learning_rate": 1.263125e-06, "loss": -0.0068, "num_tokens": 155418146.0, "reward": 1.5067741870880127, "reward_std": 0.06859349459409714, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5067741274833679, "rewards/correct_reward_func/std": 0.14450393617153168, "step": 1190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1952.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1412.1785888671875, "completions/mean_terminated_length": 1412.1785888671875, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 1.855140186915888, "grad_norm": 0.6955941319465637, "kl": 0.05810222774744034, "learning_rate": 1.2625e-06, "loss": 0.0041, "num_tokens": 155542823.0, "reward": 1.4718637466430664, "reward_std": 0.1663665622472763, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.5194827318191528, "rewards/correct_reward_func/std": 0.13893841207027435, "step": 1191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 1450.357177734375, "completions/mean_terminated_length": 1450.357177734375, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "epoch": 1.8566978193146417, "grad_norm": 0.6135053634643555, "kl": 0.054131994023919106, "learning_rate": 1.261875e-06, "loss": 0.019, "num_tokens": 155670797.0, "reward": 1.4821985960006714, "reward_std": 0.07913772016763687, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4821985363960266, "rewards/correct_reward_func/std": 0.1432725340127945, "step": 1192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1534.3214111328125, "completions/mean_terminated_length": 1534.3214111328125, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 1.8582554517133958, "grad_norm": 0.5876079201698303, "kl": 0.05327623710036278, "learning_rate": 1.26125e-06, "loss": -0.0101, "num_tokens": 155805908.0, "reward": 1.4201315641403198, "reward_std": 0.13506700098514557, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4439409375190735, "rewards/correct_reward_func/std": 0.1291300356388092, "step": 1193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2172.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 1447.166748046875, "completions/mean_terminated_length": 1447.166748046875, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 1.8598130841121496, "grad_norm": 0.6002687811851501, "kl": 0.05455399490892887, "learning_rate": 1.2606250000000001e-06, "loss": 0.0051, "num_tokens": 155933566.0, "reward": 1.5439289808273315, "reward_std": 0.09578830003738403, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5439289808273315, "rewards/correct_reward_func/std": 0.14296294748783112, "step": 1194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2507.0, "completions/max_terminated_length": 2507.0, "completions/mean_length": 1492.15478515625, "completions/mean_terminated_length": 1492.15478515625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 1.8613707165109035, "grad_norm": 0.5409152507781982, "kl": 0.054493218660354614, "learning_rate": 1.26e-06, "loss": -0.0157, "num_tokens": 156064931.0, "reward": 1.426120400428772, "reward_std": 0.08042621612548828, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4380251467227936, "rewards/correct_reward_func/std": 0.08640678226947784, "step": 1195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1400.2261962890625, "completions/mean_terminated_length": 1400.2261962890625, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 1.8629283489096573, "grad_norm": 0.6744022965431213, "kl": 0.05779414065182209, "learning_rate": 1.259375e-06, "loss": 0.0071, "num_tokens": 156188424.0, "reward": 1.4028637409210205, "reward_std": 0.06023504585027695, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4028637409210205, "rewards/correct_reward_func/std": 0.17682860791683197, "step": 1196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 1373.1190185546875, "completions/mean_terminated_length": 1373.1190185546875, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 1.8644859813084111, "grad_norm": 0.6175127625465393, "kl": 0.054437488317489624, "learning_rate": 1.25875e-06, "loss": 0.0268, "num_tokens": 156309808.0, "reward": 1.5614629983901978, "reward_std": 0.062259506434202194, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.561462938785553, "rewards/correct_reward_func/std": 0.16409927606582642, "step": 1197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 1511.7381591796875, "completions/mean_terminated_length": 1511.7381591796875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 1.866043613707165, "grad_norm": 0.6167406439781189, "kl": 0.05430261790752411, "learning_rate": 1.258125e-06, "loss": 0.002, "num_tokens": 156442920.0, "reward": 1.5209169387817383, "reward_std": 0.0793665200471878, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5209168195724487, "rewards/correct_reward_func/std": 0.14757879078388214, "step": 1198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 1455.21435546875, "completions/mean_terminated_length": 1455.21435546875, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 1.867601246105919, "grad_norm": 0.6312068700790405, "kl": 0.05524124763906002, "learning_rate": 1.2575e-06, "loss": -0.0236, "num_tokens": 156571278.0, "reward": 1.5241351127624512, "reward_std": 0.05386820435523987, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5241349935531616, "rewards/correct_reward_func/std": 0.095490962266922, "step": 1199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1487.1429443359375, "completions/mean_terminated_length": 1487.1429443359375, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 1.8691588785046729, "grad_norm": 0.5886282324790955, "kl": 0.055202048271894455, "learning_rate": 1.256875e-06, "loss": 0.0174, "num_tokens": 156702252.0, "reward": 1.4928206205368042, "reward_std": 0.06394536048173904, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49282047152519226, "rewards/correct_reward_func/std": 0.13995295763015747, "step": 1200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2050.0, "completions/max_terminated_length": 2050.0, "completions/mean_length": 1411.1309814453125, "completions/mean_terminated_length": 1411.1309814453125, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 1.870716510903427, "grad_norm": 0.6019092202186584, "kl": 0.053061360493302345, "learning_rate": 1.25625e-06, "loss": -0.012, "num_tokens": 156826667.0, "reward": 1.4709663391113281, "reward_std": 0.09214991331100464, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48287099599838257, "rewards/correct_reward_func/std": 0.11982110142707825, "step": 1201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1448.3809814453125, "completions/mean_terminated_length": 1448.3809814453125, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 1.8722741433021808, "grad_norm": 0.6398613452911377, "kl": 0.053339963778853416, "learning_rate": 1.255625e-06, "loss": -0.0057, "num_tokens": 156954319.0, "reward": 1.5515879392623901, "reward_std": 0.06445462256669998, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5634927153587341, "rewards/correct_reward_func/std": 0.10669108480215073, "step": 1202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 1510.5238037109375, "completions/mean_terminated_length": 1430.0240478515625, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "epoch": 1.8738317757009346, "grad_norm": 0.6044386625289917, "kl": 0.0528288371860981, "learning_rate": 1.2549999999999998e-06, "loss": 0.0454, "num_tokens": 157087161.0, "reward": 1.4499683380126953, "reward_std": 0.06496580690145493, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44996827840805054, "rewards/correct_reward_func/std": 0.16170236468315125, "step": 1203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2460.0, "completions/max_terminated_length": 2460.0, "completions/mean_length": 1443.9761962890625, "completions/mean_terminated_length": 1443.9761962890625, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 1.8753894080996885, "grad_norm": 0.6295816898345947, "kl": 0.05467262864112854, "learning_rate": 1.254375e-06, "loss": -0.0112, "num_tokens": 157214269.0, "reward": 1.484448790550232, "reward_std": 0.09725934267044067, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49635347723960876, "rewards/correct_reward_func/std": 0.1546335369348526, "step": 1204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2348.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 1463.2857666015625, "completions/mean_terminated_length": 1463.2857666015625, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 1.8769470404984423, "grad_norm": 0.6658309698104858, "kl": 0.05453195050358772, "learning_rate": 1.2537499999999999e-06, "loss": 0.0132, "num_tokens": 157343143.0, "reward": 1.4912623167037964, "reward_std": 0.04655470326542854, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4912622570991516, "rewards/correct_reward_func/std": 0.1253512054681778, "step": 1205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1395.34521484375, "completions/mean_terminated_length": 1395.34521484375, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "epoch": 1.8785046728971961, "grad_norm": 0.6738154292106628, "kl": 0.05441772937774658, "learning_rate": 1.253125e-06, "loss": 0.0046, "num_tokens": 157466130.0, "reward": 1.4852144718170166, "reward_std": 0.09451096504926682, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4971192181110382, "rewards/correct_reward_func/std": 0.10943647474050522, "step": 1206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 1500.7381591796875, "completions/mean_terminated_length": 1500.7381591796875, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 1.8800623052959502, "grad_norm": 0.6398898959159851, "kl": 0.055941881611943245, "learning_rate": 1.2524999999999999e-06, "loss": 0.0032, "num_tokens": 157598372.0, "reward": 1.4226170778274536, "reward_std": 0.05477407947182655, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4226168990135193, "rewards/correct_reward_func/std": 0.12621764838695526, "step": 1207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1449.9761962890625, "completions/mean_terminated_length": 1449.9761962890625, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 1.881619937694704, "grad_norm": 0.6029897928237915, "kl": 0.05583735927939415, "learning_rate": 1.251875e-06, "loss": -0.024, "num_tokens": 157726164.0, "reward": 1.5082427263259888, "reward_std": 0.0946372002363205, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.520147442817688, "rewards/correct_reward_func/std": 0.1646113246679306, "step": 1208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2148.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 1428.7857666015625, "completions/mean_terminated_length": 1428.7857666015625, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 1.883177570093458, "grad_norm": 0.5933884382247925, "kl": 0.055446457117795944, "learning_rate": 1.2512499999999999e-06, "loss": -0.0193, "num_tokens": 157852146.0, "reward": 1.482871174812317, "reward_std": 0.04775906354188919, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4828711450099945, "rewards/correct_reward_func/std": 0.16042080521583557, "step": 1209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1435.8929443359375, "completions/mean_terminated_length": 1435.8929443359375, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 1.884735202492212, "grad_norm": 0.6398827433586121, "kl": 0.05569705553352833, "learning_rate": 1.250625e-06, "loss": -0.0079, "num_tokens": 157978845.0, "reward": 1.544633388519287, "reward_std": 0.05590443313121796, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5446333885192871, "rewards/correct_reward_func/std": 0.10797035694122314, "step": 1210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1491.166748046875, "completions/mean_terminated_length": 1491.166748046875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 1.8862928348909658, "grad_norm": 0.5854122638702393, "kl": 0.05436447449028492, "learning_rate": 1.2499999999999999e-06, "loss": -0.006, "num_tokens": 158110049.0, "reward": 1.5124729871749878, "reward_std": 0.06320083886384964, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.512472927570343, "rewards/correct_reward_func/std": 0.125708669424057, "step": 1211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2717.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 1520.952392578125, "completions/mean_terminated_length": 1520.952392578125, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "epoch": 1.8878504672897196, "grad_norm": 0.5825960040092468, "kl": 0.05764823034405708, "learning_rate": 1.2493749999999998e-06, "loss": 0.0198, "num_tokens": 158243851.0, "reward": 1.4864962100982666, "reward_std": 0.06447198241949081, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48649606108665466, "rewards/correct_reward_func/std": 0.15843161940574646, "step": 1212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 1506.5714111328125, "completions/mean_terminated_length": 1506.5714111328125, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "epoch": 1.8894080996884735, "grad_norm": 0.5692533850669861, "kl": 0.0551010612398386, "learning_rate": 1.24875e-06, "loss": -0.0074, "num_tokens": 158376577.0, "reward": 1.4795805215835571, "reward_std": 0.09836306422948837, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4914851486682892, "rewards/correct_reward_func/std": 0.1642414629459381, "step": 1213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4141.0, "completions/max_terminated_length": 4141.0, "completions/mean_length": 1465.5357666015625, "completions/mean_terminated_length": 1465.5357666015625, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 1.8909657320872273, "grad_norm": 0.5892350077629089, "kl": 0.05867234244942665, "learning_rate": 1.2481249999999998e-06, "loss": 0.0065, "num_tokens": 158505526.0, "reward": 1.410791277885437, "reward_std": 0.05722607299685478, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4107913076877594, "rewards/correct_reward_func/std": 0.09546013921499252, "step": 1214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1543.8690185546875, "completions/mean_terminated_length": 1543.8690185546875, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 1.8925233644859814, "grad_norm": 0.5391766428947449, "kl": 0.05596735142171383, "learning_rate": 1.2475e-06, "loss": 0.0095, "num_tokens": 158641259.0, "reward": 1.4469826221466064, "reward_std": 0.07032877206802368, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4588872790336609, "rewards/correct_reward_func/std": 0.14017288386821747, "step": 1215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1885.0, "completions/max_terminated_length": 1885.0, "completions/mean_length": 1428.107177734375, "completions/mean_terminated_length": 1428.107177734375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 1.8940809968847352, "grad_norm": 0.6034140586853027, "kl": 0.059058764949440956, "learning_rate": 1.2468749999999998e-06, "loss": -0.0118, "num_tokens": 158767148.0, "reward": 1.4679725170135498, "reward_std": 0.0756353810429573, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4679723381996155, "rewards/correct_reward_func/std": 0.11842022091150284, "step": 1216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1451.1309814453125, "completions/mean_terminated_length": 1451.1309814453125, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 1.8956386292834893, "grad_norm": 0.593402624130249, "kl": 0.05814815312623978, "learning_rate": 1.24625e-06, "loss": 0.011, "num_tokens": 158894959.0, "reward": 1.4441269636154175, "reward_std": 0.07663440704345703, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4560317099094391, "rewards/correct_reward_func/std": 0.117508165538311, "step": 1217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 1597.4405517578125, "completions/mean_terminated_length": 1597.4405517578125, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "epoch": 1.897196261682243, "grad_norm": 0.573035478591919, "kl": 0.05495023354887962, "learning_rate": 1.2456249999999998e-06, "loss": -0.0325, "num_tokens": 159035354.0, "reward": 1.5224978923797607, "reward_std": 0.08499068766832352, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5344026684761047, "rewards/correct_reward_func/std": 0.11581014841794968, "step": 1218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1517.5714111328125, "completions/mean_terminated_length": 1517.5714111328125, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 1.898753894080997, "grad_norm": 0.5983966588973999, "kl": 0.058529168367385864, "learning_rate": 1.2450000000000002e-06, "loss": -0.001, "num_tokens": 159168878.0, "reward": 1.5053348541259766, "reward_std": 0.06362482905387878, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.505334734916687, "rewards/correct_reward_func/std": 0.10379531234502792, "step": 1219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3165.0, "completions/max_terminated_length": 3165.0, "completions/mean_length": 1515.011962890625, "completions/mean_terminated_length": 1515.011962890625, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 1.9003115264797508, "grad_norm": 0.5892098546028137, "kl": 0.05673946999013424, "learning_rate": 1.244375e-06, "loss": 0.0095, "num_tokens": 159302145.0, "reward": 1.5087298154830933, "reward_std": 0.05147193372249603, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5087298154830933, "rewards/correct_reward_func/std": 0.15614551305770874, "step": 1220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1549.6905517578125, "completions/mean_terminated_length": 1549.6905517578125, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "epoch": 1.9018691588785046, "grad_norm": 0.5687181949615479, "kl": 0.05817488580942154, "learning_rate": 1.24375e-06, "loss": -0.0056, "num_tokens": 159438481.0, "reward": 1.494850754737854, "reward_std": 0.07789193093776703, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.506755530834198, "rewards/correct_reward_func/std": 0.18507011234760284, "step": 1221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1471.0714111328125, "completions/mean_terminated_length": 1471.0714111328125, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 1.9034267912772584, "grad_norm": 0.6105194091796875, "kl": 0.05363965407013893, "learning_rate": 1.243125e-06, "loss": -0.0094, "num_tokens": 159567877.0, "reward": 1.4625232219696045, "reward_std": 0.08769676089286804, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4744279086589813, "rewards/correct_reward_func/std": 0.14392836391925812, "step": 1222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1528.6905517578125, "completions/mean_terminated_length": 1528.6905517578125, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "epoch": 1.9049844236760125, "grad_norm": 0.6121729016304016, "kl": 0.053402090445160866, "learning_rate": 1.2425e-06, "loss": 0.0067, "num_tokens": 159702323.0, "reward": 1.562993049621582, "reward_std": 0.05727924033999443, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.562993049621582, "rewards/correct_reward_func/std": 0.15030689537525177, "step": 1223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1473.5714111328125, "completions/mean_terminated_length": 1473.5714111328125, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 1.9065420560747663, "grad_norm": 0.6061887741088867, "kl": 0.05679184943437576, "learning_rate": 1.241875e-06, "loss": 0.0093, "num_tokens": 159831959.0, "reward": 1.5351828336715698, "reward_std": 0.05577115714550018, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5351826548576355, "rewards/correct_reward_func/std": 0.11443596333265305, "step": 1224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1486.9285888671875, "completions/mean_terminated_length": 1486.9285888671875, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 1.9080996884735204, "grad_norm": 0.6339111924171448, "kl": 0.05756880156695843, "learning_rate": 1.24125e-06, "loss": -0.0048, "num_tokens": 159962771.0, "reward": 1.548493504524231, "reward_std": 0.06505528092384338, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5484933257102966, "rewards/correct_reward_func/std": 0.14723271131515503, "step": 1225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2512.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 1506.71435546875, "completions/mean_terminated_length": 1506.71435546875, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 1.9096573208722742, "grad_norm": 0.6125451922416687, "kl": 0.056310176849365234, "learning_rate": 1.240625e-06, "loss": 0.0381, "num_tokens": 160095407.0, "reward": 1.4650319814682007, "reward_std": 0.05291309952735901, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46503204107284546, "rewards/correct_reward_func/std": 0.17154107987880707, "step": 1226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2084.0, "completions/max_terminated_length": 2084.0, "completions/mean_length": 1484.0833740234375, "completions/mean_terminated_length": 1484.0833740234375, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 1.911214953271028, "grad_norm": 0.625715434551239, "kl": 0.05549544841051102, "learning_rate": 1.24e-06, "loss": 0.0175, "num_tokens": 160225938.0, "reward": 1.5273239612579346, "reward_std": 0.055574677884578705, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5273239612579346, "rewards/correct_reward_func/std": 0.18045419454574585, "step": 1227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 1508.7261962890625, "completions/mean_terminated_length": 1508.7261962890625, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 1.912772585669782, "grad_norm": 0.6170700192451477, "kl": 0.05653890781104565, "learning_rate": 1.2393749999999999e-06, "loss": -0.0152, "num_tokens": 160358665.0, "reward": 1.5222980976104736, "reward_std": 0.09018044173717499, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5342028737068176, "rewards/correct_reward_func/std": 0.18403998017311096, "step": 1228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 1451.511962890625, "completions/mean_terminated_length": 1451.511962890625, "completions/min_length": 1038.0, "completions/min_terminated_length": 1038.0, "epoch": 1.9143302180685358, "grad_norm": 0.6161199808120728, "kl": 0.06045283377170563, "learning_rate": 1.23875e-06, "loss": 0.0172, "num_tokens": 160486544.0, "reward": 1.504557490348816, "reward_std": 0.05878777801990509, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5045574903488159, "rewards/correct_reward_func/std": 0.17383144795894623, "step": 1229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2741.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 1467.8095703125, "completions/mean_terminated_length": 1467.8095703125, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 1.9158878504672896, "grad_norm": 0.6232401728630066, "kl": 0.05727357417345047, "learning_rate": 1.238125e-06, "loss": 0.0176, "num_tokens": 160615834.0, "reward": 1.538735032081604, "reward_std": 0.06778266280889511, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5387349128723145, "rewards/correct_reward_func/std": 0.15467937290668488, "step": 1230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 1524.166748046875, "completions/mean_terminated_length": 1524.166748046875, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 1.9174454828660437, "grad_norm": 0.5900119543075562, "kl": 0.05672660283744335, "learning_rate": 1.2375e-06, "loss": 0.0068, "num_tokens": 160750152.0, "reward": 1.463720679283142, "reward_std": 0.048480767756700516, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4637205898761749, "rewards/correct_reward_func/std": 0.14796467125415802, "step": 1231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 1446.6785888671875, "completions/mean_terminated_length": 1446.6785888671875, "completions/min_length": 623.0, "completions/min_terminated_length": 623.0, "epoch": 1.9190031152647975, "grad_norm": 0.6100647449493408, "kl": 0.05839471332728863, "learning_rate": 1.236875e-06, "loss": 0.0007, "num_tokens": 160877745.0, "reward": 1.555694580078125, "reward_std": 0.0916619524359703, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5795038938522339, "rewards/correct_reward_func/std": 0.15683145821094513, "step": 1232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 1516.0357666015625, "completions/mean_terminated_length": 1516.0357666015625, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 1.9205607476635516, "grad_norm": 0.594845712184906, "kl": 0.05445660836994648, "learning_rate": 1.23625e-06, "loss": 0.005, "num_tokens": 161011230.0, "reward": 1.5668760538101196, "reward_std": 0.09204652905464172, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5787807703018188, "rewards/correct_reward_func/std": 0.1532406061887741, "step": 1233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1503.7857666015625, "completions/mean_terminated_length": 1503.7857666015625, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "epoch": 1.9221183800623054, "grad_norm": 0.6142722964286804, "kl": 0.056435633450746536, "learning_rate": 1.235625e-06, "loss": -0.0017, "num_tokens": 161143506.0, "reward": 1.5156687498092651, "reward_std": 0.09886422008275986, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5275734066963196, "rewards/correct_reward_func/std": 0.1786666214466095, "step": 1234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 1636.1309814453125, "completions/mean_terminated_length": 1557.14453125, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 1.9236760124610592, "grad_norm": 0.5612958073616028, "kl": 0.052957579493522644, "learning_rate": 1.235e-06, "loss": 0.0627, "num_tokens": 161287103.0, "reward": 1.4853476285934448, "reward_std": 0.06636354327201843, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4853476583957672, "rewards/correct_reward_func/std": 0.17209897935390472, "step": 1235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1449.3690185546875, "completions/mean_terminated_length": 1449.3690185546875, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 1.925233644859813, "grad_norm": 0.6625157594680786, "kl": 0.06063521094620228, "learning_rate": 1.234375e-06, "loss": 0.0142, "num_tokens": 161414826.0, "reward": 1.5159897804260254, "reward_std": 0.06611641496419907, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5159897208213806, "rewards/correct_reward_func/std": 0.13702069222927094, "step": 1236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1510.3095703125, "completions/mean_terminated_length": 1510.3095703125, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 1.926791277258567, "grad_norm": 0.5958049297332764, "kl": 0.0545333456248045, "learning_rate": 1.2337499999999998e-06, "loss": -0.0069, "num_tokens": 161547620.0, "reward": 1.495219349861145, "reward_std": 0.055185046046972275, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4952191710472107, "rewards/correct_reward_func/std": 0.12605080008506775, "step": 1237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2202.0, "completions/max_terminated_length": 2202.0, "completions/mean_length": 1466.3809814453125, "completions/mean_terminated_length": 1466.3809814453125, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 1.9283489096573208, "grad_norm": 0.62995845079422, "kl": 0.056595442816615105, "learning_rate": 1.233125e-06, "loss": 0.0063, "num_tokens": 161676748.0, "reward": 1.4204565286636353, "reward_std": 0.05791353061795235, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4204564690589905, "rewards/correct_reward_func/std": 0.10334562510251999, "step": 1238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1420.6190185546875, "completions/mean_terminated_length": 1420.6190185546875, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 1.9299065420560748, "grad_norm": 0.6097623109817505, "kl": 0.058360595256090164, "learning_rate": 1.2324999999999998e-06, "loss": 0.0003, "num_tokens": 161802098.0, "reward": 1.528025507926941, "reward_std": 0.06208934262394905, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5280255079269409, "rewards/correct_reward_func/std": 0.14775097370147705, "step": 1239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 1455.8214111328125, "completions/mean_terminated_length": 1455.8214111328125, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 1.9314641744548287, "grad_norm": 0.6094352006912231, "kl": 0.05603187344968319, "learning_rate": 1.231875e-06, "loss": -0.0279, "num_tokens": 161930369.0, "reward": 1.4845614433288574, "reward_std": 0.07169675827026367, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4845612943172455, "rewards/correct_reward_func/std": 0.1435069590806961, "step": 1240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2652.0, "completions/max_terminated_length": 2652.0, "completions/mean_length": 1498.6429443359375, "completions/mean_terminated_length": 1498.6429443359375, "completions/min_length": 1043.0, "completions/min_terminated_length": 1043.0, "epoch": 1.9330218068535827, "grad_norm": 0.5977336764335632, "kl": 0.05589907057583332, "learning_rate": 1.2312499999999999e-06, "loss": -0.0007, "num_tokens": 162062375.0, "reward": 1.520866870880127, "reward_std": 0.14748117327690125, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5446763634681702, "rewards/correct_reward_func/std": 0.20734147727489471, "step": 1241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2057.0, "completions/max_terminated_length": 2057.0, "completions/mean_length": 1429.4405517578125, "completions/mean_terminated_length": 1429.4405517578125, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 1.9345794392523366, "grad_norm": 0.5834234356880188, "kl": 0.05620076134800911, "learning_rate": 1.230625e-06, "loss": -0.0427, "num_tokens": 162188568.0, "reward": 1.4639087915420532, "reward_std": 0.07554743438959122, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47581353783607483, "rewards/correct_reward_func/std": 0.11057791858911514, "step": 1242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 1436.3690185546875, "completions/mean_terminated_length": 1436.3690185546875, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 1.9361370716510904, "grad_norm": 0.6232700943946838, "kl": 0.05432428978383541, "learning_rate": 1.2299999999999999e-06, "loss": -0.0101, "num_tokens": 162315085.0, "reward": 1.5685479640960693, "reward_std": 0.07263903319835663, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5685479044914246, "rewards/correct_reward_func/std": 0.16187405586242676, "step": 1243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2100.0, "completions/max_terminated_length": 2100.0, "completions/mean_length": 1487.8690185546875, "completions/mean_terminated_length": 1487.8690185546875, "completions/min_length": 453.0, "completions/min_terminated_length": 453.0, "epoch": 1.9376947040498442, "grad_norm": 0.5650731325149536, "kl": 0.0536829624325037, "learning_rate": 1.229375e-06, "loss": -0.0528, "num_tokens": 162445970.0, "reward": 1.5079346895217896, "reward_std": 0.07853532582521439, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5079345703125, "rewards/correct_reward_func/std": 0.13202406466007233, "step": 1244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1958.0, "completions/max_terminated_length": 1958.0, "completions/mean_length": 1392.9881591796875, "completions/mean_terminated_length": 1392.9881591796875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 1.939252336448598, "grad_norm": 0.5979002118110657, "kl": 0.05525914579629898, "learning_rate": 1.2287499999999999e-06, "loss": -0.0057, "num_tokens": 162568981.0, "reward": 1.5228124856948853, "reward_std": 0.0510064922273159, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5228123664855957, "rewards/correct_reward_func/std": 0.13123123347759247, "step": 1245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2077.0, "completions/max_terminated_length": 2077.0, "completions/mean_length": 1452.702392578125, "completions/mean_terminated_length": 1452.702392578125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 1.940809968847352, "grad_norm": 0.6322863101959229, "kl": 0.057448774576187134, "learning_rate": 1.2281249999999998e-06, "loss": -0.0084, "num_tokens": 162696816.0, "reward": 1.4991294145584106, "reward_std": 0.05651399865746498, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4991292357444763, "rewards/correct_reward_func/std": 0.16178050637245178, "step": 1246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1487.452392578125, "completions/mean_terminated_length": 1487.452392578125, "completions/min_length": 730.0, "completions/min_terminated_length": 730.0, "epoch": 1.942367601246106, "grad_norm": 0.5885341167449951, "kl": 0.054767319932579994, "learning_rate": 1.2275e-06, "loss": 0.0305, "num_tokens": 162827642.0, "reward": 1.5081366300582886, "reward_std": 0.07464636117219925, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5081365704536438, "rewards/correct_reward_func/std": 0.21267029643058777, "step": 1247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1439.4761962890625, "completions/mean_terminated_length": 1439.4761962890625, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 1.9439252336448598, "grad_norm": 0.6086589097976685, "kl": 0.053049420937895775, "learning_rate": 1.2268749999999998e-06, "loss": 0.0039, "num_tokens": 162954540.0, "reward": 1.4959285259246826, "reward_std": 0.04519243165850639, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4959285259246826, "rewards/correct_reward_func/std": 0.14655932784080505, "step": 1248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2562.0, "completions/max_terminated_length": 2562.0, "completions/mean_length": 1605.40478515625, "completions/mean_terminated_length": 1605.40478515625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 1.9454828660436139, "grad_norm": 0.5914570093154907, "kl": 0.053994106128811836, "learning_rate": 1.22625e-06, "loss": -0.0055, "num_tokens": 163095520.0, "reward": 1.461374044418335, "reward_std": 0.07442204654216766, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.473278671503067, "rewards/correct_reward_func/std": 0.14128980040550232, "step": 1249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 1540.047607421875, "completions/mean_terminated_length": 1540.047607421875, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "epoch": 1.9470404984423677, "grad_norm": 0.5995625257492065, "kl": 0.05544967204332352, "learning_rate": 1.2256249999999998e-06, "loss": 0.0197, "num_tokens": 163230956.0, "reward": 1.5096129179000854, "reward_std": 0.05899566411972046, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5096127390861511, "rewards/correct_reward_func/std": 0.13278187811374664, "step": 1250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2871.0, "completions/max_terminated_length": 2871.0, "completions/mean_length": 1516.25, "completions/mean_terminated_length": 1516.25, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 1.9485981308411215, "grad_norm": 0.5983135104179382, "kl": 0.05303397215902805, "learning_rate": 1.2250000000000001e-06, "loss": -0.026, "num_tokens": 163364297.0, "reward": 1.5285906791687012, "reward_std": 0.08423756808042526, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5404952764511108, "rewards/correct_reward_func/std": 0.17101401090621948, "step": 1251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 1553.0, "completions/mean_terminated_length": 1553.0, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "epoch": 1.9501557632398754, "grad_norm": 0.5910971164703369, "kl": 0.053005462512373924, "learning_rate": 1.224375e-06, "loss": 0.0018, "num_tokens": 163500695.0, "reward": 1.551090121269226, "reward_std": 0.08731625229120255, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5629948377609253, "rewards/correct_reward_func/std": 0.16429637372493744, "step": 1252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1492.25, "completions/mean_terminated_length": 1492.25, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 1.9517133956386292, "grad_norm": 0.6033821105957031, "kl": 0.053926773369312286, "learning_rate": 1.22375e-06, "loss": 0.0229, "num_tokens": 163631990.0, "reward": 1.5334794521331787, "reward_std": 0.0960599035024643, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5453841090202332, "rewards/correct_reward_func/std": 0.10378874838352203, "step": 1253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1491.4405517578125, "completions/mean_terminated_length": 1491.4405517578125, "completions/min_length": 952.0, "completions/min_terminated_length": 952.0, "epoch": 1.953271028037383, "grad_norm": 0.6178480386734009, "kl": 0.05189335532486439, "learning_rate": 1.223125e-06, "loss": 0.0197, "num_tokens": 163763247.0, "reward": 1.5316691398620605, "reward_std": 0.05091511830687523, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5316690802574158, "rewards/correct_reward_func/std": 0.18393395841121674, "step": 1254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3009.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 1497.047607421875, "completions/mean_terminated_length": 1497.047607421875, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 1.9548286604361371, "grad_norm": 0.5547710657119751, "kl": 0.05215086415410042, "learning_rate": 1.2225e-06, "loss": 0.0187, "num_tokens": 163894987.0, "reward": 1.5297099351882935, "reward_std": 0.09183689951896667, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5416147112846375, "rewards/correct_reward_func/std": 0.121444933116436, "step": 1255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1496.857177734375, "completions/mean_terminated_length": 1496.857177734375, "completions/min_length": 981.0, "completions/min_terminated_length": 981.0, "epoch": 1.956386292834891, "grad_norm": 0.6268892288208008, "kl": 0.05400961637496948, "learning_rate": 1.221875e-06, "loss": 0.0149, "num_tokens": 164026735.0, "reward": 1.5271872282028198, "reward_std": 0.06674759089946747, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.527186930179596, "rewards/correct_reward_func/std": 0.14271174371242523, "step": 1256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 1575.3095703125, "completions/mean_terminated_length": 1575.3095703125, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "epoch": 1.957943925233645, "grad_norm": 0.5513768196105957, "kl": 0.0525702778249979, "learning_rate": 1.22125e-06, "loss": -0.0256, "num_tokens": 164165103.0, "reward": 1.420666217803955, "reward_std": 0.0835125520825386, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4325709640979767, "rewards/correct_reward_func/std": 0.12534375488758087, "step": 1257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1447.8333740234375, "completions/mean_terminated_length": 1447.8333740234375, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 1.9595015576323989, "grad_norm": 0.6187682747840881, "kl": 0.05288996361196041, "learning_rate": 1.220625e-06, "loss": 0.019, "num_tokens": 164292643.0, "reward": 1.494939923286438, "reward_std": 0.08971309661865234, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5068445205688477, "rewards/correct_reward_func/std": 0.12182911485433578, "step": 1258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2248.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 1480.4881591796875, "completions/mean_terminated_length": 1480.4881591796875, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 1.9610591900311527, "grad_norm": 0.5808848142623901, "kl": 0.05048451945185661, "learning_rate": 1.22e-06, "loss": -0.0149, "num_tokens": 164423034.0, "reward": 1.5767302513122559, "reward_std": 0.05737687274813652, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5767301321029663, "rewards/correct_reward_func/std": 0.16525724530220032, "step": 1259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6141.0, "completions/max_terminated_length": 6141.0, "completions/mean_length": 1530.607177734375, "completions/mean_terminated_length": 1530.607177734375, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 1.9626168224299065, "grad_norm": 0.5983599424362183, "kl": 0.052588196471333504, "learning_rate": 1.219375e-06, "loss": -0.0415, "num_tokens": 164557557.0, "reward": 1.542871356010437, "reward_std": 0.06814587861299515, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.542871356010437, "rewards/correct_reward_func/std": 0.12595169246196747, "step": 1260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1496.75, "completions/mean_terminated_length": 1496.75, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 1.9641744548286604, "grad_norm": 0.5757709741592407, "kl": 0.0547416340559721, "learning_rate": 1.21875e-06, "loss": -0.0213, "num_tokens": 164689254.0, "reward": 1.5508331060409546, "reward_std": 0.08947653323411942, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.562737762928009, "rewards/correct_reward_func/std": 0.15128713846206665, "step": 1261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 1485.297607421875, "completions/mean_terminated_length": 1485.297607421875, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 1.9657320872274142, "grad_norm": 0.5948099493980408, "kl": 0.052826663479208946, "learning_rate": 1.2181249999999999e-06, "loss": 0.0193, "num_tokens": 164819983.0, "reward": 1.4465724229812622, "reward_std": 0.06566507369279861, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4465723931789398, "rewards/correct_reward_func/std": 0.13240855932235718, "step": 1262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 1455.8214111328125, "completions/mean_terminated_length": 1455.8214111328125, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 1.9672897196261683, "grad_norm": 0.6218265891075134, "kl": 0.0531153529882431, "learning_rate": 1.2175e-06, "loss": 0.0199, "num_tokens": 164948194.0, "reward": 1.5275919437408447, "reward_std": 0.05380544811487198, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5275918245315552, "rewards/correct_reward_func/std": 0.15963366627693176, "step": 1263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1500.15478515625, "completions/mean_terminated_length": 1500.15478515625, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 1.9688473520249221, "grad_norm": 0.5963311791419983, "kl": 0.05395059287548065, "learning_rate": 1.2168749999999999e-06, "loss": 0.001, "num_tokens": 165080015.0, "reward": 1.3872771263122559, "reward_std": 0.12041691690683365, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.42299142479896545, "rewards/correct_reward_func/std": 0.12725859880447388, "step": 1264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1492.8214111328125, "completions/mean_terminated_length": 1492.8214111328125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 1.9704049844236762, "grad_norm": 0.5740455985069275, "kl": 0.052818788215518, "learning_rate": 1.21625e-06, "loss": -0.0228, "num_tokens": 165211268.0, "reward": 1.552432656288147, "reward_std": 0.06279515475034714, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.552432656288147, "rewards/correct_reward_func/std": 0.14807192981243134, "step": 1265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2605.0, "completions/mean_length": 1572.59521484375, "completions/mean_terminated_length": 1492.84326171875, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "epoch": 1.97196261682243, "grad_norm": 0.6058183908462524, "kl": 0.05111592262983322, "learning_rate": 1.215625e-06, "loss": 0.054, "num_tokens": 165349246.0, "reward": 1.5167975425720215, "reward_std": 0.10025399178266525, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5167975425720215, "rewards/correct_reward_func/std": 0.1563318818807602, "step": 1266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1489.6905517578125, "completions/mean_terminated_length": 1489.6905517578125, "completions/min_length": 1047.0, "completions/min_terminated_length": 1047.0, "epoch": 1.9735202492211839, "grad_norm": 0.6266090869903564, "kl": 0.05259551480412483, "learning_rate": 1.215e-06, "loss": -0.0079, "num_tokens": 165480266.0, "reward": 1.4993327856063843, "reward_std": 0.06754369288682938, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4993326663970947, "rewards/correct_reward_func/std": 0.11055982112884521, "step": 1267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 1495.7381591796875, "completions/mean_terminated_length": 1495.7381591796875, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 1.9750778816199377, "grad_norm": 0.5864278078079224, "kl": 0.05348556116223335, "learning_rate": 1.214375e-06, "loss": 0.0038, "num_tokens": 165611854.0, "reward": 1.5049890279769897, "reward_std": 0.05775702744722366, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5049889087677002, "rewards/correct_reward_func/std": 0.14344222843647003, "step": 1268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2133.0, "completions/max_terminated_length": 2133.0, "completions/mean_length": 1492.3809814453125, "completions/mean_terminated_length": 1492.3809814453125, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 1.9766355140186915, "grad_norm": 0.6424045562744141, "kl": 0.054369064047932625, "learning_rate": 1.21375e-06, "loss": 0.035, "num_tokens": 165743226.0, "reward": 1.5482243299484253, "reward_std": 0.06635341048240662, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5482242107391357, "rewards/correct_reward_func/std": 0.1771761178970337, "step": 1269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 1477.65478515625, "completions/mean_terminated_length": 1477.65478515625, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 1.9781931464174454, "grad_norm": 0.660260796546936, "kl": 0.0529660489410162, "learning_rate": 1.213125e-06, "loss": 0.0243, "num_tokens": 165873403.0, "reward": 1.4610735177993774, "reward_std": 0.05761678144335747, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4610733687877655, "rewards/correct_reward_func/std": 0.15052658319473267, "step": 1270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 1578.4761962890625, "completions/mean_terminated_length": 1578.4761962890625, "completions/min_length": 521.0, "completions/min_terminated_length": 521.0, "epoch": 1.9797507788161994, "grad_norm": 0.5531615018844604, "kl": 0.05402318015694618, "learning_rate": 1.2124999999999998e-06, "loss": 0.0024, "num_tokens": 166012295.0, "reward": 1.5023671388626099, "reward_std": 0.05987171828746796, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5023671388626099, "rewards/correct_reward_func/std": 0.1565093696117401, "step": 1271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1503.7857666015625, "completions/mean_terminated_length": 1503.7857666015625, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 1.9813084112149533, "grad_norm": 0.6268497705459595, "kl": 0.053085023537278175, "learning_rate": 1.211875e-06, "loss": 0.0036, "num_tokens": 166144661.0, "reward": 1.4730051755905151, "reward_std": 0.11777850985527039, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4968147873878479, "rewards/correct_reward_func/std": 0.16005007922649384, "step": 1272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 1461.0595703125, "completions/mean_terminated_length": 1461.0595703125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 1.9828660436137073, "grad_norm": 0.6158193349838257, "kl": 0.05314483679831028, "learning_rate": 1.2112499999999998e-06, "loss": 0.0136, "num_tokens": 166273276.0, "reward": 1.5000544786453247, "reward_std": 0.055675625801086426, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5000544190406799, "rewards/correct_reward_func/std": 0.2088353931903839, "step": 1273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2449.0, "completions/max_terminated_length": 2449.0, "completions/mean_length": 1434.1309814453125, "completions/mean_terminated_length": 1434.1309814453125, "completions/min_length": 624.0, "completions/min_terminated_length": 624.0, "epoch": 1.9844236760124612, "grad_norm": 0.6059211492538452, "kl": 0.053006915375590324, "learning_rate": 1.210625e-06, "loss": 0.0088, "num_tokens": 166399791.0, "reward": 1.478981614112854, "reward_std": 0.05414564535021782, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47898170351982117, "rewards/correct_reward_func/std": 0.13632602989673615, "step": 1274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1450.666748046875, "completions/mean_terminated_length": 1450.666748046875, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 1.985981308411215, "grad_norm": 0.6059961915016174, "kl": 0.05159132182598114, "learning_rate": 1.2099999999999998e-06, "loss": -0.0311, "num_tokens": 166527623.0, "reward": 1.4945504665374756, "reward_std": 0.10077395290136337, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.50645512342453, "rewards/correct_reward_func/std": 0.1674635410308838, "step": 1275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 1462.9881591796875, "completions/mean_terminated_length": 1462.9881591796875, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 1.9875389408099688, "grad_norm": 0.6390466094017029, "kl": 0.0553548913449049, "learning_rate": 1.209375e-06, "loss": 0.0296, "num_tokens": 166656448.0, "reward": 1.4924731254577637, "reward_std": 0.07603880763053894, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5043776631355286, "rewards/correct_reward_func/std": 0.17303898930549622, "step": 1276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2552.0, "completions/max_terminated_length": 2552.0, "completions/mean_length": 1562.9285888671875, "completions/mean_terminated_length": 1562.9285888671875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.9890965732087227, "grad_norm": 0.5586164593696594, "kl": 0.05111539550125599, "learning_rate": 1.2087499999999999e-06, "loss": -0.0299, "num_tokens": 166794034.0, "reward": 1.5329746007919312, "reward_std": 0.0800163522362709, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5329747200012207, "rewards/correct_reward_func/std": 0.17308349907398224, "step": 1277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1419.952392578125, "completions/mean_terminated_length": 1419.952392578125, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 1.9906542056074765, "grad_norm": 0.6261523962020874, "kl": 0.05347118340432644, "learning_rate": 1.2081249999999998e-06, "loss": 0.0066, "num_tokens": 166919250.0, "reward": 1.5714763402938843, "reward_std": 0.05952037125825882, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5714763402938843, "rewards/correct_reward_func/std": 0.14183683693408966, "step": 1278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 1416.1429443359375, "completions/mean_terminated_length": 1416.1429443359375, "completions/min_length": 383.0, "completions/min_terminated_length": 383.0, "epoch": 1.9922118380062306, "grad_norm": 0.6015198230743408, "kl": 0.05422072671353817, "learning_rate": 1.2074999999999999e-06, "loss": -0.0194, "num_tokens": 167044344.0, "reward": 1.5053538084030151, "reward_std": 0.06031499803066254, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053538084030151, "rewards/correct_reward_func/std": 0.15424591302871704, "step": 1279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3585.0, "completions/max_terminated_length": 3585.0, "completions/mean_length": 1447.5357666015625, "completions/mean_terminated_length": 1447.5357666015625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "epoch": 1.9937694704049844, "grad_norm": 0.6257075667381287, "kl": 0.05304074473679066, "learning_rate": 1.2068749999999998e-06, "loss": -0.0317, "num_tokens": 167171751.0, "reward": 1.4833705425262451, "reward_std": 0.09662158042192459, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4952751696109772, "rewards/correct_reward_func/std": 0.18059666454792023, "step": 1280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2210.0, "completions/max_terminated_length": 2210.0, "completions/mean_length": 1439.90478515625, "completions/mean_terminated_length": 1439.90478515625, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 1.9953271028037385, "grad_norm": 0.6328639984130859, "kl": 0.05405781976878643, "learning_rate": 1.2062499999999999e-06, "loss": 0.0111, "num_tokens": 167298697.0, "reward": 1.509008765220642, "reward_std": 0.09354681521654129, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5209135413169861, "rewards/correct_reward_func/std": 0.17299731075763702, "step": 1281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 1470.3690185546875, "completions/mean_terminated_length": 1470.3690185546875, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 1.9968847352024923, "grad_norm": 0.578328549861908, "kl": 0.050938211381435394, "learning_rate": 1.205625e-06, "loss": 0.0009, "num_tokens": 167428226.0, "reward": 1.557409405708313, "reward_std": 0.0640474259853363, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5574092864990234, "rewards/correct_reward_func/std": 0.14930056035518646, "step": 1282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1953.0, "completions/max_terminated_length": 1953.0, "completions/mean_length": 1403.297607421875, "completions/mean_terminated_length": 1403.297607421875, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 1.9984423676012462, "grad_norm": 0.5905470252037048, "kl": 0.051910923793911934, "learning_rate": 1.2050000000000001e-06, "loss": 0.0226, "num_tokens": 167552313.0, "reward": 1.5577044486999512, "reward_std": 0.05520172044634819, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5577042698860168, "rewards/correct_reward_func/std": 0.13589416444301605, "step": 1283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2232.0, "completions/max_terminated_length": 2232.0, "completions/mean_length": 1422.6190185546875, "completions/mean_terminated_length": 1422.6190185546875, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 2.0, "grad_norm": 0.6009348034858704, "kl": 0.05313561297953129, "learning_rate": 1.204375e-06, "loss": 0.0264, "num_tokens": 167677861.0, "reward": 1.511346697807312, "reward_std": 0.058749720454216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5113465785980225, "rewards/correct_reward_func/std": 0.14649075269699097, "step": 1284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1387.75, "completions/mean_terminated_length": 1387.75, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 2.001557632398754, "grad_norm": 0.6198617219924927, "kl": 0.05458259582519531, "learning_rate": 1.2037500000000001e-06, "loss": 0.008, "num_tokens": 167800306.0, "reward": 1.4963420629501343, "reward_std": 0.05120270699262619, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4963420629501343, "rewards/correct_reward_func/std": 0.14947372674942017, "step": 1285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2658.0, "completions/max_terminated_length": 2658.0, "completions/mean_length": 1415.6190185546875, "completions/mean_terminated_length": 1415.6190185546875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 2.0031152647975077, "grad_norm": 0.6016020178794861, "kl": 0.05195882171392441, "learning_rate": 1.203125e-06, "loss": 0.0093, "num_tokens": 167925212.0, "reward": 1.5437531471252441, "reward_std": 0.05195217952132225, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5437529683113098, "rewards/correct_reward_func/std": 0.11240639537572861, "step": 1286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1495.6190185546875, "completions/mean_terminated_length": 1414.939697265625, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "epoch": 2.0046728971962615, "grad_norm": 0.6111792922019958, "kl": 0.05117208510637283, "learning_rate": 1.2025e-06, "loss": 0.0561, "num_tokens": 168056796.0, "reward": 1.4031814336776733, "reward_std": 0.09181969612836838, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.415086030960083, "rewards/correct_reward_func/std": 0.14939112961292267, "step": 1287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 1442.8333740234375, "completions/mean_terminated_length": 1442.8333740234375, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 2.0062305295950154, "grad_norm": 0.5975027680397034, "kl": 0.05319266952574253, "learning_rate": 1.201875e-06, "loss": 0.005, "num_tokens": 168184072.0, "reward": 1.5213496685028076, "reward_std": 0.1099877581000328, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.545159101486206, "rewards/correct_reward_func/std": 0.1262146681547165, "step": 1288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 1341.9761962890625, "completions/mean_terminated_length": 1341.9761962890625, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 2.0077881619937696, "grad_norm": 0.5900332927703857, "kl": 0.050959544256329536, "learning_rate": 1.20125e-06, "loss": 0.0118, "num_tokens": 168302738.0, "reward": 1.5180631875991821, "reward_std": 0.03214741870760918, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5180630683898926, "rewards/correct_reward_func/std": 0.1762181520462036, "step": 1289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1408.011962890625, "completions/mean_terminated_length": 1408.011962890625, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "epoch": 2.0093457943925235, "grad_norm": 0.6007463335990906, "kl": 0.05380294658243656, "learning_rate": 1.200625e-06, "loss": -0.0019, "num_tokens": 168426969.0, "reward": 1.519254207611084, "reward_std": 0.07921576499938965, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5311589241027832, "rewards/correct_reward_func/std": 0.13961145281791687, "step": 1290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1323.607177734375, "completions/mean_terminated_length": 1323.607177734375, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 2.0109034267912773, "grad_norm": 0.6135416626930237, "kl": 0.05574348382651806, "learning_rate": 1.2e-06, "loss": 0.0117, "num_tokens": 168544116.0, "reward": 1.443884253501892, "reward_std": 0.045881446450948715, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4438842535018921, "rewards/correct_reward_func/std": 0.13830821216106415, "step": 1291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 1403.2857666015625, "completions/mean_terminated_length": 1321.493896484375, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 2.012461059190031, "grad_norm": 0.5899112224578857, "kl": 0.05292821489274502, "learning_rate": 1.199375e-06, "loss": 0.0707, "num_tokens": 168667872.0, "reward": 1.482300877571106, "reward_std": 0.06602161377668381, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4823008179664612, "rewards/correct_reward_func/std": 0.1503794938325882, "step": 1292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1965.0, "completions/max_terminated_length": 1965.0, "completions/mean_length": 1418.6905517578125, "completions/mean_terminated_length": 1418.6905517578125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 2.014018691588785, "grad_norm": 0.6301987767219543, "kl": 0.056902993470430374, "learning_rate": 1.19875e-06, "loss": 0.0251, "num_tokens": 168793150.0, "reward": 1.520931601524353, "reward_std": 0.05373011529445648, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.520931601524353, "rewards/correct_reward_func/std": 0.1550585776567459, "step": 1293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1818.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 1306.0357666015625, "completions/mean_terminated_length": 1306.0357666015625, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "epoch": 2.015576323987539, "grad_norm": 0.6527206301689148, "kl": 0.05538877472281456, "learning_rate": 1.198125e-06, "loss": -0.0049, "num_tokens": 168908833.0, "reward": 1.4936171770095825, "reward_std": 0.09311771392822266, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5055219531059265, "rewards/correct_reward_func/std": 0.15839269757270813, "step": 1294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1307.7857666015625, "completions/mean_terminated_length": 1307.7857666015625, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 2.0171339563862927, "grad_norm": 0.6406489610671997, "kl": 0.05233604088425636, "learning_rate": 1.1975e-06, "loss": 0.0106, "num_tokens": 169024711.0, "reward": 1.4948046207427979, "reward_std": 0.07318916916847229, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5067092180252075, "rewards/correct_reward_func/std": 0.2039363980293274, "step": 1295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1343.0, "completions/mean_terminated_length": 1343.0, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 2.0186915887850465, "grad_norm": 0.6360169053077698, "kl": 0.05363561399281025, "learning_rate": 1.1968749999999999e-06, "loss": 0.0117, "num_tokens": 169143355.0, "reward": 1.4313719272613525, "reward_std": 0.07863809168338776, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.443276584148407, "rewards/correct_reward_func/std": 0.19791553914546967, "step": 1296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 1324.761962890625, "completions/mean_terminated_length": 1324.761962890625, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 2.020249221183801, "grad_norm": 0.6539590358734131, "kl": 0.05420477129518986, "learning_rate": 1.19625e-06, "loss": -0.0228, "num_tokens": 169260431.0, "reward": 1.5739299058914185, "reward_std": 0.08958905190229416, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5858345031738281, "rewards/correct_reward_func/std": 0.20335423946380615, "step": 1297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 1381.5714111328125, "completions/mean_terminated_length": 1381.5714111328125, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 2.0218068535825546, "grad_norm": 0.6075211763381958, "kl": 0.05459413677453995, "learning_rate": 1.1956249999999999e-06, "loss": 0.0046, "num_tokens": 169382441.0, "reward": 1.4989190101623535, "reward_std": 0.056590426713228226, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4989190399646759, "rewards/correct_reward_func/std": 0.18089325726032257, "step": 1298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1369.357177734375, "completions/mean_terminated_length": 1369.357177734375, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 2.0233644859813085, "grad_norm": 0.6661106944084167, "kl": 0.05170141160488129, "learning_rate": 1.195e-06, "loss": 0.0021, "num_tokens": 169503311.0, "reward": 1.4624065160751343, "reward_std": 0.060307297855615616, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46240633726119995, "rewards/correct_reward_func/std": 0.151441290974617, "step": 1299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1378.65478515625, "completions/mean_terminated_length": 1378.65478515625, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 2.0249221183800623, "grad_norm": 0.6782479286193848, "kl": 0.0576251819729805, "learning_rate": 1.1943749999999999e-06, "loss": -0.0135, "num_tokens": 169625238.0, "reward": 1.5194391012191772, "reward_std": 0.062837615609169, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5194391012191772, "rewards/correct_reward_func/std": 0.17157082259655, "step": 1300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1366.7261962890625, "completions/mean_terminated_length": 1366.7261962890625, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.026479750778816, "grad_norm": 0.6088794469833374, "kl": 0.05408782698214054, "learning_rate": 1.19375e-06, "loss": 0.0202, "num_tokens": 169746037.0, "reward": 1.4859356880187988, "reward_std": 0.054425615817308426, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48593562841415405, "rewards/correct_reward_func/std": 0.12695002555847168, "step": 1301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1401.166748046875, "completions/mean_terminated_length": 1401.166748046875, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 2.02803738317757, "grad_norm": 0.6174154281616211, "kl": 0.05535493791103363, "learning_rate": 1.193125e-06, "loss": -0.034, "num_tokens": 169869717.0, "reward": 1.504166603088379, "reward_std": 0.04865359887480736, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5041665434837341, "rewards/correct_reward_func/std": 0.14053207635879517, "step": 1302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1337.6785888671875, "completions/mean_terminated_length": 1337.6785888671875, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 2.029595015576324, "grad_norm": 0.6270648241043091, "kl": 0.05714597553014755, "learning_rate": 1.1924999999999998e-06, "loss": -0.0278, "num_tokens": 169987908.0, "reward": 1.5129345655441284, "reward_std": 0.06364456564188004, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5129345059394836, "rewards/correct_reward_func/std": 0.11493108421564102, "step": 1303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 1385.011962890625, "completions/mean_terminated_length": 1385.011962890625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 2.0311526479750777, "grad_norm": 0.6225783824920654, "kl": 0.05746995285153389, "learning_rate": 1.191875e-06, "loss": -0.0147, "num_tokens": 170110237.0, "reward": 1.493137240409851, "reward_std": 0.056991107761859894, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49313727021217346, "rewards/correct_reward_func/std": 0.14563824236392975, "step": 1304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1371.2261962890625, "completions/mean_terminated_length": 1371.2261962890625, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 2.032710280373832, "grad_norm": 0.6455997228622437, "kl": 0.056764278560876846, "learning_rate": 1.1912499999999998e-06, "loss": -0.0234, "num_tokens": 170231426.0, "reward": 1.5041488409042358, "reward_std": 0.05794251710176468, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5041487812995911, "rewards/correct_reward_func/std": 0.11207922548055649, "step": 1305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1406.7738037109375, "completions/mean_terminated_length": 1406.7738037109375, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 2.034267912772586, "grad_norm": 0.594173789024353, "kl": 0.053897615522146225, "learning_rate": 1.190625e-06, "loss": 0.0222, "num_tokens": 170355607.0, "reward": 1.5034401416778564, "reward_std": 0.06129951775074005, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5034400224685669, "rewards/correct_reward_func/std": 0.1440412700176239, "step": 1306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2382.0, "completions/max_terminated_length": 2382.0, "completions/mean_length": 1419.6190185546875, "completions/mean_terminated_length": 1419.6190185546875, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 2.0358255451713396, "grad_norm": 0.5930730700492859, "kl": 0.054901355877518654, "learning_rate": 1.1899999999999998e-06, "loss": 0.0066, "num_tokens": 170480783.0, "reward": 1.4827772378921509, "reward_std": 0.06203296035528183, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48277708888053894, "rewards/correct_reward_func/std": 0.15417973697185516, "step": 1307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 1409.0357666015625, "completions/mean_terminated_length": 1409.0357666015625, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 2.0373831775700935, "grad_norm": 0.6383758783340454, "kl": 0.05286584980785847, "learning_rate": 1.189375e-06, "loss": 0.0232, "num_tokens": 170604980.0, "reward": 1.5658137798309326, "reward_std": 0.07685311883687973, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5658137798309326, "rewards/correct_reward_func/std": 0.19458825886249542, "step": 1308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 1404.857177734375, "completions/mean_terminated_length": 1404.857177734375, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 2.0389408099688473, "grad_norm": 0.5920485258102417, "kl": 0.05119282379746437, "learning_rate": 1.1887499999999998e-06, "loss": 0.0032, "num_tokens": 170729114.0, "reward": 1.5448683500289917, "reward_std": 0.04954070970416069, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5448681712150574, "rewards/correct_reward_func/std": 0.12921468913555145, "step": 1309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1324.0833740234375, "completions/mean_terminated_length": 1324.0833740234375, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 2.040498442367601, "grad_norm": 0.6491631865501404, "kl": 0.0530067328363657, "learning_rate": 1.188125e-06, "loss": -0.0004, "num_tokens": 170846319.0, "reward": 1.6047366857528687, "reward_std": 0.05881234630942345, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.6047365069389343, "rewards/correct_reward_func/std": 0.11745820939540863, "step": 1310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1431.46435546875, "completions/mean_terminated_length": 1431.46435546875, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 2.042056074766355, "grad_norm": 0.563828706741333, "kl": 0.05154792591929436, "learning_rate": 1.1874999999999999e-06, "loss": 0.0153, "num_tokens": 170972448.0, "reward": 1.5478053092956543, "reward_std": 0.05429919809103012, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.54780513048172, "rewards/correct_reward_func/std": 0.16813106834888458, "step": 1311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2099.0, "completions/max_terminated_length": 2099.0, "completions/mean_length": 1439.952392578125, "completions/mean_terminated_length": 1439.952392578125, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 2.043613707165109, "grad_norm": 0.5618917942047119, "kl": 0.052127595990896225, "learning_rate": 1.1868749999999998e-06, "loss": 0.0351, "num_tokens": 171099482.0, "reward": 1.5230122804641724, "reward_std": 0.044129878282547, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5230123400688171, "rewards/correct_reward_func/std": 0.12101703882217407, "step": 1312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1435.2857666015625, "completions/mean_terminated_length": 1435.2857666015625, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 2.045171339563863, "grad_norm": 0.5968584418296814, "kl": 0.05271363630890846, "learning_rate": 1.18625e-06, "loss": 0.0192, "num_tokens": 171225986.0, "reward": 1.4837173223495483, "reward_std": 0.04450034722685814, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.483717143535614, "rewards/correct_reward_func/std": 0.13271783292293549, "step": 1313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 1479.916748046875, "completions/mean_terminated_length": 1479.916748046875, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "epoch": 2.046728971962617, "grad_norm": 0.5781486630439758, "kl": 0.054236821830272675, "learning_rate": 1.185625e-06, "loss": 0.0271, "num_tokens": 171356263.0, "reward": 1.4423896074295044, "reward_std": 0.08699092268943787, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45429420471191406, "rewards/correct_reward_func/std": 0.1491469144821167, "step": 1314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1419.0833740234375, "completions/mean_terminated_length": 1419.0833740234375, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 2.0482866043613708, "grad_norm": 0.6145700812339783, "kl": 0.058677542954683304, "learning_rate": 1.185e-06, "loss": 0.005, "num_tokens": 171481502.0, "reward": 1.4967024326324463, "reward_std": 0.055205754935741425, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4967023432254791, "rewards/correct_reward_func/std": 0.17420785129070282, "step": 1315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 1449.3095703125, "completions/mean_terminated_length": 1449.3095703125, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 2.0498442367601246, "grad_norm": 0.5873131155967712, "kl": 0.055539270862936974, "learning_rate": 1.184375e-06, "loss": 0.0102, "num_tokens": 171609238.0, "reward": 1.482214093208313, "reward_std": 0.04849381744861603, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48221397399902344, "rewards/correct_reward_func/std": 0.13805167376995087, "step": 1316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2125.0, "completions/max_terminated_length": 2125.0, "completions/mean_length": 1421.25, "completions/mean_terminated_length": 1421.25, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 2.0514018691588785, "grad_norm": 0.5826941728591919, "kl": 0.053842829540371895, "learning_rate": 1.18375e-06, "loss": -0.0103, "num_tokens": 171734731.0, "reward": 1.4909722805023193, "reward_std": 0.08816754072904587, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5028769373893738, "rewards/correct_reward_func/std": 0.1646195948123932, "step": 1317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 1422.9405517578125, "completions/mean_terminated_length": 1422.9405517578125, "completions/min_length": 802.0, "completions/min_terminated_length": 802.0, "epoch": 2.0529595015576323, "grad_norm": 0.6292868256568909, "kl": 0.055219922214746475, "learning_rate": 1.183125e-06, "loss": 0.0108, "num_tokens": 171860132.0, "reward": 1.537479043006897, "reward_std": 0.08281675726175308, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5493836998939514, "rewards/correct_reward_func/std": 0.1704007387161255, "step": 1318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1397.46435546875, "completions/mean_terminated_length": 1397.46435546875, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 2.054517133956386, "grad_norm": 0.6106641292572021, "kl": 0.0550366286188364, "learning_rate": 1.1825000000000001e-06, "loss": 0.0235, "num_tokens": 171983375.0, "reward": 1.3991354703903198, "reward_std": 0.08677927404642105, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4110400676727295, "rewards/correct_reward_func/std": 0.19159509241580963, "step": 1319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1429.25, "completions/mean_terminated_length": 1429.25, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 2.05607476635514, "grad_norm": 0.5924327969551086, "kl": 0.05212895758450031, "learning_rate": 1.181875e-06, "loss": -0.0023, "num_tokens": 172109402.0, "reward": 1.4476702213287354, "reward_std": 0.0536293126642704, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44767022132873535, "rewards/correct_reward_func/std": 0.15320226550102234, "step": 1320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 1459.3929443359375, "completions/mean_terminated_length": 1378.277099609375, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 2.0576323987538943, "grad_norm": 0.5764317512512207, "kl": 0.051515115424990654, "learning_rate": 1.18125e-06, "loss": -0.0499, "num_tokens": 172237961.0, "reward": 1.5333306789398193, "reward_std": 0.07086651027202606, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5333305597305298, "rewards/correct_reward_func/std": 0.13200446963310242, "step": 1321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1370.511962890625, "completions/mean_terminated_length": 1370.511962890625, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 2.059190031152648, "grad_norm": 0.5975598692893982, "kl": 0.05376181751489639, "learning_rate": 1.180625e-06, "loss": -0.0006, "num_tokens": 172359012.0, "reward": 1.443628191947937, "reward_std": 0.07048165053129196, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4436279535293579, "rewards/correct_reward_func/std": 0.15080465376377106, "step": 1322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1385.9761962890625, "completions/mean_terminated_length": 1385.9761962890625, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 2.060747663551402, "grad_norm": 0.6012426614761353, "kl": 0.05235354043543339, "learning_rate": 1.18e-06, "loss": -0.0336, "num_tokens": 172481350.0, "reward": 1.537935733795166, "reward_std": 0.043422479182481766, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5379356741905212, "rewards/correct_reward_func/std": 0.13390177488327026, "step": 1323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1506.2738037109375, "completions/mean_terminated_length": 1425.7227783203125, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 2.0623052959501558, "grad_norm": 0.5477936863899231, "kl": 0.05041305534541607, "learning_rate": 1.179375e-06, "loss": -0.0015, "num_tokens": 172613985.0, "reward": 1.4686349630355835, "reward_std": 0.045441679656505585, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46863484382629395, "rewards/correct_reward_func/std": 0.11044660955667496, "step": 1324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2214.0, "completions/mean_length": 1488.40478515625, "completions/mean_terminated_length": 1407.6385498046875, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 2.0638629283489096, "grad_norm": 0.5896651148796082, "kl": 0.05202721990644932, "learning_rate": 1.17875e-06, "loss": -0.0476, "num_tokens": 172745131.0, "reward": 1.5267891883850098, "reward_std": 0.13563129305839539, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.550598680973053, "rewards/correct_reward_func/std": 0.18807443976402283, "step": 1325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2126.0, "completions/max_terminated_length": 2126.0, "completions/mean_length": 1417.6785888671875, "completions/mean_terminated_length": 1417.6785888671875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 2.0654205607476634, "grad_norm": 0.6207713484764099, "kl": 0.052984775975346565, "learning_rate": 1.178125e-06, "loss": 0.0216, "num_tokens": 172870222.0, "reward": 1.5294607877731323, "reward_std": 0.07811157405376434, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5413654446601868, "rewards/correct_reward_func/std": 0.11763960868120193, "step": 1326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 1423.1785888671875, "completions/mean_terminated_length": 1423.1785888671875, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.0669781931464173, "grad_norm": 0.5963159799575806, "kl": 0.050035351887345314, "learning_rate": 1.1775e-06, "loss": -0.0149, "num_tokens": 172995919.0, "reward": 1.486838459968567, "reward_std": 0.0780094787478447, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4987431466579437, "rewards/correct_reward_func/std": 0.13482625782489777, "step": 1327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1981.0, "completions/max_terminated_length": 1981.0, "completions/mean_length": 1406.2738037109375, "completions/mean_terminated_length": 1406.2738037109375, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 2.068535825545171, "grad_norm": 0.595504879951477, "kl": 0.05312320031225681, "learning_rate": 1.1768749999999998e-06, "loss": 0.0069, "num_tokens": 173119914.0, "reward": 1.5552579164505005, "reward_std": 0.05643022060394287, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5552579164505005, "rewards/correct_reward_func/std": 0.10491575300693512, "step": 1328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1460.357177734375, "completions/mean_terminated_length": 1379.2529296875, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 2.0700934579439254, "grad_norm": 0.5734352469444275, "kl": 0.05355260521173477, "learning_rate": 1.17625e-06, "loss": -0.0665, "num_tokens": 173248530.0, "reward": 1.4935506582260132, "reward_std": 0.06551594287157059, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49355053901672363, "rewards/correct_reward_func/std": 0.15725794434547424, "step": 1329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2042.0, "completions/max_terminated_length": 2042.0, "completions/mean_length": 1402.8095703125, "completions/mean_terminated_length": 1402.8095703125, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 2.0716510903426792, "grad_norm": 0.5710020065307617, "kl": 0.0533020943403244, "learning_rate": 1.1756249999999999e-06, "loss": -0.0114, "num_tokens": 173372270.0, "reward": 1.509724497795105, "reward_std": 0.07798776030540466, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5097243785858154, "rewards/correct_reward_func/std": 0.16208356618881226, "step": 1330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1916.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 1379.9285888671875, "completions/mean_terminated_length": 1379.9285888671875, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 2.073208722741433, "grad_norm": 0.6016309857368469, "kl": 0.05197571963071823, "learning_rate": 1.175e-06, "loss": 0.0024, "num_tokens": 173494058.0, "reward": 1.591169834136963, "reward_std": 0.05961233377456665, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5911697149276733, "rewards/correct_reward_func/std": 0.13058185577392578, "step": 1331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2419.0, "completions/max_terminated_length": 2419.0, "completions/mean_length": 1400.3333740234375, "completions/mean_terminated_length": 1400.3333740234375, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 2.074766355140187, "grad_norm": 0.5705133676528931, "kl": 0.051452573388814926, "learning_rate": 1.1743749999999999e-06, "loss": 0.0022, "num_tokens": 173617776.0, "reward": 1.5316303968429565, "reward_std": 0.07902618497610092, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5316303968429565, "rewards/correct_reward_func/std": 0.17537401616573334, "step": 1332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2056.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 1469.2381591796875, "completions/mean_terminated_length": 1469.2381591796875, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 2.0763239875389408, "grad_norm": 0.5970563292503357, "kl": 0.05328632518649101, "learning_rate": 1.17375e-06, "loss": -0.0104, "num_tokens": 173747132.0, "reward": 1.5186409950256348, "reward_std": 0.08987218141555786, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.530545711517334, "rewards/correct_reward_func/std": 0.19355741143226624, "step": 1333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 1478.71435546875, "completions/mean_terminated_length": 1478.71435546875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 2.0778816199376946, "grad_norm": 0.5974871516227722, "kl": 0.05466659180819988, "learning_rate": 1.1731249999999999e-06, "loss": 0.0139, "num_tokens": 173877272.0, "reward": 1.4784029722213745, "reward_std": 0.10357818007469177, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49030765891075134, "rewards/correct_reward_func/std": 0.1401202380657196, "step": 1334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 1471.011962890625, "completions/mean_terminated_length": 1471.011962890625, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 2.0794392523364484, "grad_norm": 0.5724372863769531, "kl": 0.05292382277548313, "learning_rate": 1.1725e-06, "loss": 0.0111, "num_tokens": 174006693.0, "reward": 1.4356780052185059, "reward_std": 0.0720079094171524, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4475826919078827, "rewards/correct_reward_func/std": 0.09196102619171143, "step": 1335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1452.6785888671875, "completions/mean_terminated_length": 1452.6785888671875, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 2.0809968847352023, "grad_norm": 0.6357861161231995, "kl": 0.05236661992967129, "learning_rate": 1.171875e-06, "loss": -0.0127, "num_tokens": 174134760.0, "reward": 1.5508917570114136, "reward_std": 0.06290838867425919, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5508916974067688, "rewards/correct_reward_func/std": 0.09926848858594894, "step": 1336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1447.797607421875, "completions/mean_terminated_length": 1447.797607421875, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 2.0825545171339566, "grad_norm": 0.5405333638191223, "kl": 0.05155564658343792, "learning_rate": 1.1712499999999998e-06, "loss": 0.0127, "num_tokens": 174262351.0, "reward": 1.4611423015594482, "reward_std": 0.04483713582158089, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4611421227455139, "rewards/correct_reward_func/std": 0.14346285164356232, "step": 1337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 1483.8333740234375, "completions/mean_terminated_length": 1483.8333740234375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.0841121495327104, "grad_norm": 0.5998951196670532, "kl": 0.052249545231461525, "learning_rate": 1.170625e-06, "loss": -0.0003, "num_tokens": 174393017.0, "reward": 1.5021874904632568, "reward_std": 0.10547463595867157, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5259968638420105, "rewards/correct_reward_func/std": 0.15751409530639648, "step": 1338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1433.6785888671875, "completions/mean_terminated_length": 1433.6785888671875, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 2.0856697819314642, "grad_norm": 0.5774150490760803, "kl": 0.05220690928399563, "learning_rate": 1.1699999999999998e-06, "loss": -0.022, "num_tokens": 174519230.0, "reward": 1.5210987329483032, "reward_std": 0.07365919649600983, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5210986137390137, "rewards/correct_reward_func/std": 0.14650999009609222, "step": 1339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 1517.5238037109375, "completions/mean_terminated_length": 1517.5238037109375, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 2.087227414330218, "grad_norm": 0.6292716860771179, "kl": 0.0527616161853075, "learning_rate": 1.169375e-06, "loss": -0.003, "num_tokens": 174652696.0, "reward": 1.468656063079834, "reward_std": 0.10742703080177307, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4924655258655548, "rewards/correct_reward_func/std": 0.16501155495643616, "step": 1340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1497.21435546875, "completions/mean_terminated_length": 1497.21435546875, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 2.088785046728972, "grad_norm": 0.5948904752731323, "kl": 0.05212790332734585, "learning_rate": 1.1687499999999998e-06, "loss": 0.0633, "num_tokens": 174784468.0, "reward": 1.5465943813323975, "reward_std": 0.11266262084245682, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5584990978240967, "rewards/correct_reward_func/std": 0.18359963595867157, "step": 1341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1562.4761962890625, "completions/mean_terminated_length": 1562.4761962890625, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 2.0903426791277258, "grad_norm": 0.5656182169914246, "kl": 0.053281234577298164, "learning_rate": 1.168125e-06, "loss": -0.0094, "num_tokens": 174921824.0, "reward": 1.5474061965942383, "reward_std": 0.05667465925216675, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5474059581756592, "rewards/correct_reward_func/std": 0.1580144464969635, "step": 1342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 1491.702392578125, "completions/mean_terminated_length": 1491.702392578125, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "epoch": 2.0919003115264796, "grad_norm": 0.5756015777587891, "kl": 0.05138104036450386, "learning_rate": 1.1674999999999998e-06, "loss": 0.0176, "num_tokens": 175053007.0, "reward": 1.5201773643493652, "reward_std": 0.060937099158763885, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5201773047447205, "rewards/correct_reward_func/std": 0.14612407982349396, "step": 1343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1431.0833740234375, "completions/mean_terminated_length": 1431.0833740234375, "completions/min_length": 1058.0, "completions/min_terminated_length": 1058.0, "epoch": 2.0934579439252334, "grad_norm": 0.5880382657051086, "kl": 0.053947100415825844, "learning_rate": 1.1668750000000002e-06, "loss": 0.0033, "num_tokens": 175179044.0, "reward": 1.5036864280700684, "reward_std": 0.055334627628326416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5036864280700684, "rewards/correct_reward_func/std": 0.2022797167301178, "step": 1344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1503.8929443359375, "completions/mean_terminated_length": 1503.8929443359375, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 2.0950155763239877, "grad_norm": 0.6069063544273376, "kl": 0.052782196551561356, "learning_rate": 1.16625e-06, "loss": 0.012, "num_tokens": 175311299.0, "reward": 1.4483745098114014, "reward_std": 0.07644081860780716, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4602791965007782, "rewards/correct_reward_func/std": 0.15015453100204468, "step": 1345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1538.3333740234375, "completions/mean_terminated_length": 1538.3333740234375, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 2.0965732087227416, "grad_norm": 0.581824541091919, "kl": 0.05427423492074013, "learning_rate": 1.165625e-06, "loss": 0.0049, "num_tokens": 175446585.0, "reward": 1.4912240505218506, "reward_std": 0.05229996517300606, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4912240207195282, "rewards/correct_reward_func/std": 0.11860369145870209, "step": 1346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1448.0238037109375, "completions/mean_terminated_length": 1448.0238037109375, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 2.0981308411214954, "grad_norm": 0.6432933807373047, "kl": 0.05224813334643841, "learning_rate": 1.165e-06, "loss": -0.0057, "num_tokens": 175574333.0, "reward": 1.5023880004882812, "reward_std": 0.0744994655251503, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5023878812789917, "rewards/correct_reward_func/std": 0.15816380083560944, "step": 1347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1453.7261962890625, "completions/mean_terminated_length": 1453.7261962890625, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 2.0996884735202492, "grad_norm": 0.5670420527458191, "kl": 0.05200260318815708, "learning_rate": 1.164375e-06, "loss": -0.0156, "num_tokens": 175702410.0, "reward": 1.5416178703308105, "reward_std": 0.04456706345081329, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.541617751121521, "rewards/correct_reward_func/std": 0.14189304411411285, "step": 1348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 1462.59521484375, "completions/mean_terminated_length": 1462.59521484375, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.101246105919003, "grad_norm": 0.6278854608535767, "kl": 0.05100688897073269, "learning_rate": 1.16375e-06, "loss": 0.0086, "num_tokens": 175831190.0, "reward": 1.525302529335022, "reward_std": 0.051054563373327255, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.525302529335022, "rewards/correct_reward_func/std": 0.15514932572841644, "step": 1349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1463.5, "completions/mean_terminated_length": 1463.5, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 2.102803738317757, "grad_norm": 0.6155849099159241, "kl": 0.053466156125068665, "learning_rate": 1.163125e-06, "loss": -0.0092, "num_tokens": 175960178.0, "reward": 1.4540088176727295, "reward_std": 0.061363790184259415, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45400872826576233, "rewards/correct_reward_func/std": 0.16609704494476318, "step": 1350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2127.0, "completions/max_terminated_length": 2127.0, "completions/mean_length": 1445.3690185546875, "completions/mean_terminated_length": 1445.3690185546875, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "epoch": 2.1043613707165107, "grad_norm": 0.5737720131874084, "kl": 0.055760402232408524, "learning_rate": 1.1625e-06, "loss": -0.002, "num_tokens": 176087451.0, "reward": 1.4619553089141846, "reward_std": 0.0792599618434906, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4619552195072174, "rewards/correct_reward_func/std": 0.1473221480846405, "step": 1351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 1480.6429443359375, "completions/mean_terminated_length": 1480.6429443359375, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 2.1059190031152646, "grad_norm": 0.5649061799049377, "kl": 0.05205049365758896, "learning_rate": 1.161875e-06, "loss": 0.01, "num_tokens": 176217981.0, "reward": 1.4850581884384155, "reward_std": 0.07158853113651276, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4969627857208252, "rewards/correct_reward_func/std": 0.11516907066106796, "step": 1352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 1418.702392578125, "completions/mean_terminated_length": 1418.702392578125, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 2.107476635514019, "grad_norm": 0.6122363805770874, "kl": 0.05265624262392521, "learning_rate": 1.1612499999999999e-06, "loss": -0.022, "num_tokens": 176343008.0, "reward": 1.4755033254623413, "reward_std": 0.08632132411003113, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4874080419540405, "rewards/correct_reward_func/std": 0.13778482377529144, "step": 1353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1420.5, "completions/mean_terminated_length": 1420.5, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "epoch": 2.1090342679127727, "grad_norm": 0.5810578465461731, "kl": 0.05188661254942417, "learning_rate": 1.160625e-06, "loss": -0.0069, "num_tokens": 176468054.0, "reward": 1.5177085399627686, "reward_std": 0.06032872200012207, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.517708420753479, "rewards/correct_reward_func/std": 0.1892070472240448, "step": 1354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2090.0, "completions/max_terminated_length": 2090.0, "completions/mean_length": 1422.761962890625, "completions/mean_terminated_length": 1422.761962890625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 2.1105919003115265, "grad_norm": 0.5986347794532776, "kl": 0.05370071157813072, "learning_rate": 1.16e-06, "loss": -0.0035, "num_tokens": 176593650.0, "reward": 1.56771981716156, "reward_std": 0.04732222855091095, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5677196979522705, "rewards/correct_reward_func/std": 0.13071207702159882, "step": 1355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1396.9881591796875, "completions/mean_terminated_length": 1396.9881591796875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 2.1121495327102804, "grad_norm": 0.6134873628616333, "kl": 0.053311556577682495, "learning_rate": 1.159375e-06, "loss": -0.0217, "num_tokens": 176716883.0, "reward": 1.4658392667770386, "reward_std": 0.05525967851281166, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46583911776542664, "rewards/correct_reward_func/std": 0.1776461899280548, "step": 1356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2232.0, "completions/max_terminated_length": 2232.0, "completions/mean_length": 1402.46435546875, "completions/mean_terminated_length": 1402.46435546875, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 2.1137071651090342, "grad_norm": 0.6313929557800293, "kl": 0.05237478204071522, "learning_rate": 1.15875e-06, "loss": 0.024, "num_tokens": 176840564.0, "reward": 1.517069935798645, "reward_std": 0.08432690799236298, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.528974711894989, "rewards/correct_reward_func/std": 0.15727616846561432, "step": 1357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1943.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 1381.34521484375, "completions/mean_terminated_length": 1381.34521484375, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 2.115264797507788, "grad_norm": 0.589896023273468, "kl": 0.052048154175281525, "learning_rate": 1.158125e-06, "loss": -0.0003, "num_tokens": 176962477.0, "reward": 1.5122143030166626, "reward_std": 0.05098987743258476, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.512214183807373, "rewards/correct_reward_func/std": 0.17623625695705414, "step": 1358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1411.8214111328125, "completions/mean_terminated_length": 1411.8214111328125, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 2.116822429906542, "grad_norm": 0.60505211353302, "kl": 0.05234862491488457, "learning_rate": 1.1575e-06, "loss": -0.0018, "num_tokens": 177087052.0, "reward": 1.5181258916854858, "reward_std": 0.05298001691699028, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5181258320808411, "rewards/correct_reward_func/std": 0.1747678518295288, "step": 1359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 1414.90478515625, "completions/mean_terminated_length": 1414.90478515625, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 2.1183800623052957, "grad_norm": 0.6217461228370667, "kl": 0.052322614938020706, "learning_rate": 1.156875e-06, "loss": 0.0022, "num_tokens": 177211796.0, "reward": 1.4650204181671143, "reward_std": 0.08573612570762634, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47692516446113586, "rewards/correct_reward_func/std": 0.11374403536319733, "step": 1360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1408.916748046875, "completions/mean_terminated_length": 1408.916748046875, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 2.11993769470405, "grad_norm": 0.6019426584243774, "kl": 0.05197778902947903, "learning_rate": 1.15625e-06, "loss": -0.0038, "num_tokens": 177336121.0, "reward": 1.5394080877304077, "reward_std": 0.08580493927001953, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5513127446174622, "rewards/correct_reward_func/std": 0.150873601436615, "step": 1361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 1454.797607421875, "completions/mean_terminated_length": 1454.797607421875, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 2.121495327102804, "grad_norm": 0.5814992189407349, "kl": 0.050834858790040016, "learning_rate": 1.1556249999999998e-06, "loss": -0.0239, "num_tokens": 177464366.0, "reward": 1.5059977769851685, "reward_std": 0.0810081884264946, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5179025530815125, "rewards/correct_reward_func/std": 0.1425192803144455, "step": 1362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 1472.9285888671875, "completions/mean_terminated_length": 1472.9285888671875, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "epoch": 2.1230529595015577, "grad_norm": 0.5817736983299255, "kl": 0.05163421109318733, "learning_rate": 1.155e-06, "loss": -0.0024, "num_tokens": 177594164.0, "reward": 1.5244640111923218, "reward_std": 0.06062449887394905, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.524463951587677, "rewards/correct_reward_func/std": 0.14554931223392487, "step": 1363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 1414.7381591796875, "completions/mean_terminated_length": 1414.7381591796875, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 2.1246105919003115, "grad_norm": 0.5768776535987854, "kl": 0.05143258720636368, "learning_rate": 1.1543749999999999e-06, "loss": -0.0006, "num_tokens": 177718882.0, "reward": 1.5862325429916382, "reward_std": 0.059033941477537155, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5862324833869934, "rewards/correct_reward_func/std": 0.1760243922472, "step": 1364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 1498.7381591796875, "completions/mean_terminated_length": 1498.7381591796875, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 2.1261682242990654, "grad_norm": 0.5604124069213867, "kl": 0.0480644553899765, "learning_rate": 1.15375e-06, "loss": -0.0063, "num_tokens": 177850632.0, "reward": 1.5557256937026978, "reward_std": 0.06172781437635422, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.555725634098053, "rewards/correct_reward_func/std": 0.15447255969047546, "step": 1365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2126.0, "completions/max_terminated_length": 2126.0, "completions/mean_length": 1420.6190185546875, "completions/mean_terminated_length": 1420.6190185546875, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "epoch": 2.127725856697819, "grad_norm": 0.6465847492218018, "kl": 0.052855467423796654, "learning_rate": 1.1531249999999999e-06, "loss": -0.0062, "num_tokens": 177975874.0, "reward": 1.5054396390914917, "reward_std": 0.0655108094215393, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5054395198822021, "rewards/correct_reward_func/std": 0.1440633237361908, "step": 1366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1428.25, "completions/mean_terminated_length": 1428.25, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.129283489096573, "grad_norm": 0.6005131602287292, "kl": 0.05210709199309349, "learning_rate": 1.1525e-06, "loss": 0.0056, "num_tokens": 178101883.0, "reward": 1.5685352087020874, "reward_std": 0.08305076509714127, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5804400444030762, "rewards/correct_reward_func/std": 0.16943597793579102, "step": 1367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2011.0, "completions/mean_length": 1517.5, "completions/mean_terminated_length": 1437.084228515625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.130841121495327, "grad_norm": 0.5751814842224121, "kl": 0.047493694350123405, "learning_rate": 1.1518749999999999e-06, "loss": 0.0632, "num_tokens": 178235287.0, "reward": 1.519307255744934, "reward_std": 0.08155255019664764, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5193071961402893, "rewards/correct_reward_func/std": 0.15516436100006104, "step": 1368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2448.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 1448.1429443359375, "completions/mean_terminated_length": 1448.1429443359375, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "epoch": 2.132398753894081, "grad_norm": 0.6071665287017822, "kl": 0.04949713498353958, "learning_rate": 1.15125e-06, "loss": 0.0229, "num_tokens": 178362913.0, "reward": 1.54558527469635, "reward_std": 0.044393327087163925, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5455851554870605, "rewards/correct_reward_func/std": 0.1631070077419281, "step": 1369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 1437.607177734375, "completions/mean_terminated_length": 1437.607177734375, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 2.133956386292835, "grad_norm": 0.5561443567276001, "kl": 0.05203833431005478, "learning_rate": 1.1506249999999999e-06, "loss": -0.0037, "num_tokens": 178489792.0, "reward": 1.5648730993270874, "reward_std": 0.07552195340394974, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5648730397224426, "rewards/correct_reward_func/std": 0.16755522787570953, "step": 1370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 1361.71435546875, "completions/mean_terminated_length": 1361.71435546875, "completions/min_length": 501.0, "completions/min_terminated_length": 501.0, "epoch": 2.135514018691589, "grad_norm": 0.6631940007209778, "kl": 0.05380826257169247, "learning_rate": 1.1499999999999998e-06, "loss": 0.0437, "num_tokens": 178610080.0, "reward": 1.4864405393600464, "reward_std": 0.07657842338085175, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4983453154563904, "rewards/correct_reward_func/std": 0.12738528847694397, "step": 1371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 1448.9761962890625, "completions/mean_terminated_length": 1448.9761962890625, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 2.1370716510903427, "grad_norm": 0.5592918395996094, "kl": 0.05251946114003658, "learning_rate": 1.149375e-06, "loss": 0.0364, "num_tokens": 178737878.0, "reward": 1.5119584798812866, "reward_std": 0.08927936851978302, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5119584202766418, "rewards/correct_reward_func/std": 0.16680066287517548, "step": 1372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1446.2738037109375, "completions/mean_terminated_length": 1446.2738037109375, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 2.1386292834890965, "grad_norm": 0.5679020285606384, "kl": 0.05038563534617424, "learning_rate": 1.1487499999999998e-06, "loss": 0.0138, "num_tokens": 178865497.0, "reward": 1.5020157098770142, "reward_std": 0.0990496501326561, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5258252024650574, "rewards/correct_reward_func/std": 0.12735188007354736, "step": 1373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1417.4761962890625, "completions/mean_terminated_length": 1417.4761962890625, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 2.1401869158878504, "grad_norm": 0.6142663359642029, "kl": 0.052816612645983696, "learning_rate": 1.148125e-06, "loss": 0.0096, "num_tokens": 178990637.0, "reward": 1.5503965616226196, "reward_std": 0.059045616537332535, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5503966212272644, "rewards/correct_reward_func/std": 0.16228261590003967, "step": 1374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1433.8929443359375, "completions/mean_terminated_length": 1433.8929443359375, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.141744548286604, "grad_norm": 0.5747750401496887, "kl": 0.0522424541413784, "learning_rate": 1.1474999999999998e-06, "loss": 0.0014, "num_tokens": 179117192.0, "reward": 1.5496505498886108, "reward_std": 0.04271262511610985, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5496504902839661, "rewards/correct_reward_func/std": 0.17081446945667267, "step": 1375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1436.5238037109375, "completions/mean_terminated_length": 1436.5238037109375, "completions/min_length": 1099.0, "completions/min_terminated_length": 1099.0, "epoch": 2.143302180685358, "grad_norm": 0.5699112415313721, "kl": 0.05083259753882885, "learning_rate": 1.1468750000000001e-06, "loss": 0.0128, "num_tokens": 179244034.0, "reward": 1.4924689531326294, "reward_std": 0.06198276951909065, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49246877431869507, "rewards/correct_reward_func/std": 0.15663869678974152, "step": 1376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1455.1309814453125, "completions/mean_terminated_length": 1455.1309814453125, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "epoch": 2.1448598130841123, "grad_norm": 0.5994657874107361, "kl": 0.049207188189029694, "learning_rate": 1.14625e-06, "loss": 0.0054, "num_tokens": 179372337.0, "reward": 1.5298857688903809, "reward_std": 0.05997888371348381, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5298855900764465, "rewards/correct_reward_func/std": 0.12205464392900467, "step": 1377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 1426.9405517578125, "completions/mean_terminated_length": 1426.9405517578125, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 2.146417445482866, "grad_norm": 0.5947996377944946, "kl": 0.05212075263261795, "learning_rate": 1.145625e-06, "loss": -0.0104, "num_tokens": 179498140.0, "reward": 1.486358642578125, "reward_std": 0.05221645161509514, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.486358642578125, "rewards/correct_reward_func/std": 0.14084048569202423, "step": 1378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1354.392822265625, "completions/mean_terminated_length": 1354.392822265625, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 2.14797507788162, "grad_norm": 0.5759725570678711, "kl": 0.05035865865647793, "learning_rate": 1.145e-06, "loss": 0.0362, "num_tokens": 179617759.0, "reward": 1.5036530494689941, "reward_std": 0.07476876676082611, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5155577063560486, "rewards/correct_reward_func/std": 0.12357696890830994, "step": 1379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2171.0, "completions/max_terminated_length": 2171.0, "completions/mean_length": 1329.297607421875, "completions/mean_terminated_length": 1329.297607421875, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 2.149532710280374, "grad_norm": 0.6369271278381348, "kl": 0.05151879042387009, "learning_rate": 1.144375e-06, "loss": 0.0149, "num_tokens": 179735222.0, "reward": 1.567125678062439, "reward_std": 0.0595085471868515, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5671256184577942, "rewards/correct_reward_func/std": 0.194367915391922, "step": 1380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 1442.011962890625, "completions/mean_terminated_length": 1442.011962890625, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 2.1510903426791277, "grad_norm": 0.6099092364311218, "kl": 0.05381721816956997, "learning_rate": 1.14375e-06, "loss": 0.0295, "num_tokens": 179862219.0, "reward": 1.5155208110809326, "reward_std": 0.09162039309740067, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5274254679679871, "rewards/correct_reward_func/std": 0.1108357161283493, "step": 1381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 1302.8214111328125, "completions/mean_terminated_length": 1302.8214111328125, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 2.1526479750778815, "grad_norm": 0.6265305876731873, "kl": 0.05349032022058964, "learning_rate": 1.143125e-06, "loss": -0.0194, "num_tokens": 179977464.0, "reward": 1.5195810794830322, "reward_std": 0.05852867290377617, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5195810794830322, "rewards/correct_reward_func/std": 0.15797485411167145, "step": 1382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1297.15478515625, "completions/mean_terminated_length": 1297.15478515625, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 2.1542056074766354, "grad_norm": 0.6974707245826721, "kl": 0.05273613519966602, "learning_rate": 1.1425e-06, "loss": -0.0019, "num_tokens": 180092293.0, "reward": 1.4925274848937988, "reward_std": 0.07071401178836823, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49252745509147644, "rewards/correct_reward_func/std": 0.12113825231790543, "step": 1383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1320.6785888671875, "completions/mean_terminated_length": 1320.6785888671875, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 2.155763239875389, "grad_norm": 0.6379164457321167, "kl": 0.05261093005537987, "learning_rate": 1.141875e-06, "loss": -0.0023, "num_tokens": 180209146.0, "reward": 1.4076426029205322, "reward_std": 0.04903624206781387, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4076424241065979, "rewards/correct_reward_func/std": 0.14622081816196442, "step": 1384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1326.3095703125, "completions/mean_terminated_length": 1326.3095703125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 2.1573208722741435, "grad_norm": 0.6310299634933472, "kl": 0.0517729464918375, "learning_rate": 1.14125e-06, "loss": 0.0117, "num_tokens": 180326364.0, "reward": 1.5183415412902832, "reward_std": 0.06364191323518753, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5183414220809937, "rewards/correct_reward_func/std": 0.18144991993904114, "step": 1385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1338.202392578125, "completions/mean_terminated_length": 1338.202392578125, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 2.1588785046728973, "grad_norm": 0.6296771764755249, "kl": 0.050496408715844154, "learning_rate": 1.140625e-06, "loss": -0.0138, "num_tokens": 180444665.0, "reward": 1.5802043676376343, "reward_std": 0.0711468905210495, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5802043676376343, "rewards/correct_reward_func/std": 0.14951321482658386, "step": 1386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1874.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1337.3333740234375, "completions/mean_terminated_length": 1337.3333740234375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.160436137071651, "grad_norm": 0.631397008895874, "kl": 0.05410364829003811, "learning_rate": 1.1399999999999999e-06, "loss": -0.0087, "num_tokens": 180562995.0, "reward": 1.4374275207519531, "reward_std": 0.057850684970617294, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43742743134498596, "rewards/correct_reward_func/std": 0.13749293982982635, "step": 1387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1996.0, "completions/max_terminated_length": 1996.0, "completions/mean_length": 1289.3809814453125, "completions/mean_terminated_length": 1289.3809814453125, "completions/min_length": 790.0, "completions/min_terminated_length": 790.0, "epoch": 2.161993769470405, "grad_norm": 0.6206005215644836, "kl": 0.05362395569682121, "learning_rate": 1.139375e-06, "loss": 0.0176, "num_tokens": 180677141.0, "reward": 1.497544765472412, "reward_std": 0.06361046433448792, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49754488468170166, "rewards/correct_reward_func/std": 0.15940718352794647, "step": 1388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1516.4285888671875, "completions/mean_terminated_length": 1436.0, "completions/min_length": 642.0, "completions/min_terminated_length": 642.0, "epoch": 2.163551401869159, "grad_norm": 0.5814381837844849, "kl": 0.05097048543393612, "learning_rate": 1.13875e-06, "loss": 0.0706, "num_tokens": 180810545.0, "reward": 1.5067657232284546, "reward_std": 0.05606215447187424, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5067654848098755, "rewards/correct_reward_func/std": 0.16191858053207397, "step": 1389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1837.0, "completions/max_terminated_length": 1837.0, "completions/mean_length": 1314.2857666015625, "completions/mean_terminated_length": 1314.2857666015625, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 2.1651090342679127, "grad_norm": 0.6008009314537048, "kl": 0.05112711153924465, "learning_rate": 1.138125e-06, "loss": -0.0125, "num_tokens": 180926885.0, "reward": 1.555004358291626, "reward_std": 0.05071486160159111, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5550042986869812, "rewards/correct_reward_func/std": 0.1640191525220871, "step": 1390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1926.0, "completions/max_terminated_length": 1926.0, "completions/mean_length": 1268.416748046875, "completions/mean_terminated_length": 1268.416748046875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 2.1666666666666665, "grad_norm": 0.6407901048660278, "kl": 0.05324101261794567, "learning_rate": 1.1375e-06, "loss": 0.0091, "num_tokens": 181039174.0, "reward": 1.4948005676269531, "reward_std": 0.0587511770427227, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49480050802230835, "rewards/correct_reward_func/std": 0.1258733719587326, "step": 1391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1800.0, "completions/max_terminated_length": 1800.0, "completions/mean_length": 1303.297607421875, "completions/mean_terminated_length": 1303.297607421875, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 2.1682242990654204, "grad_norm": 0.6165079474449158, "kl": 0.05256923660635948, "learning_rate": 1.136875e-06, "loss": 0.0034, "num_tokens": 181154555.0, "reward": 1.5177680253982544, "reward_std": 0.0491989366710186, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5177680253982544, "rewards/correct_reward_func/std": 0.20368434488773346, "step": 1392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 1341.7381591796875, "completions/mean_terminated_length": 1259.2047119140625, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.1697819314641746, "grad_norm": 0.6176717877388, "kl": 0.05058910325169563, "learning_rate": 1.13625e-06, "loss": 0.0389, "num_tokens": 181273141.0, "reward": 1.586047649383545, "reward_std": 0.08208438009023666, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5860474109649658, "rewards/correct_reward_func/std": 0.15453560650348663, "step": 1393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1396.1785888671875, "completions/mean_terminated_length": 1396.1785888671875, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 2.1713395638629285, "grad_norm": 0.6619312167167664, "kl": 0.05586876720190048, "learning_rate": 1.135625e-06, "loss": -0.0053, "num_tokens": 181396504.0, "reward": 1.4958618879318237, "reward_std": 0.05254824087023735, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4958617687225342, "rewards/correct_reward_func/std": 0.15852178633213043, "step": 1394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1467.9881591796875, "completions/mean_terminated_length": 1386.975830078125, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 2.1728971962616823, "grad_norm": 0.6041256189346313, "kl": 0.050383489578962326, "learning_rate": 1.135e-06, "loss": 0.0677, "num_tokens": 181526115.0, "reward": 1.5440009832382202, "reward_std": 0.09470544755458832, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5559054613113403, "rewards/correct_reward_func/std": 0.15988558530807495, "step": 1395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 1405.761962890625, "completions/mean_terminated_length": 1405.761962890625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 2.174454828660436, "grad_norm": 0.5396268367767334, "kl": 0.05292073078453541, "learning_rate": 1.1343749999999998e-06, "loss": 0.0162, "num_tokens": 181650289.0, "reward": 1.5595359802246094, "reward_std": 0.0661059319972992, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5595358610153198, "rewards/correct_reward_func/std": 0.12540122866630554, "step": 1396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1430.4405517578125, "completions/mean_terminated_length": 1430.4405517578125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 2.17601246105919, "grad_norm": 0.619700014591217, "kl": 0.053215889260172844, "learning_rate": 1.13375e-06, "loss": 0.0011, "num_tokens": 181776728.0, "reward": 1.4992388486862183, "reward_std": 0.06807240098714828, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4992387890815735, "rewards/correct_reward_func/std": 0.14371387660503387, "step": 1397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1809.0, "completions/max_terminated_length": 1809.0, "completions/mean_length": 1308.8333740234375, "completions/mean_terminated_length": 1308.8333740234375, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 2.177570093457944, "grad_norm": 0.647262454032898, "kl": 0.055274598300457, "learning_rate": 1.1331249999999998e-06, "loss": -0.0053, "num_tokens": 181892442.0, "reward": 1.509229063987732, "reward_std": 0.0441347099840641, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5092291235923767, "rewards/correct_reward_func/std": 0.17118947207927704, "step": 1398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 1333.916748046875, "completions/mean_terminated_length": 1333.916748046875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 2.1791277258566977, "grad_norm": 0.6675759553909302, "kl": 0.05312488600611687, "learning_rate": 1.1325e-06, "loss": 0.0102, "num_tokens": 182010503.0, "reward": 1.5279139280319214, "reward_std": 0.11956489831209183, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5517235398292542, "rewards/correct_reward_func/std": 0.14841613173484802, "step": 1399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1368.71435546875, "completions/mean_terminated_length": 1368.71435546875, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 2.1806853582554515, "grad_norm": 0.6055256128311157, "kl": 0.05360832065343857, "learning_rate": 1.1318749999999999e-06, "loss": -0.0003, "num_tokens": 182131655.0, "reward": 1.5209357738494873, "reward_std": 0.056754305958747864, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5209357142448425, "rewards/correct_reward_func/std": 0.11721338331699371, "step": 1400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1889.0, "completions/max_terminated_length": 1889.0, "completions/mean_length": 1344.761962890625, "completions/mean_terminated_length": 1344.761962890625, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 2.182242990654206, "grad_norm": 0.6152173280715942, "kl": 0.05152701400220394, "learning_rate": 1.13125e-06, "loss": 0.0225, "num_tokens": 182250633.0, "reward": 1.5644099712371826, "reward_std": 0.049041010439395905, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5644097328186035, "rewards/correct_reward_func/std": 0.13315775990486145, "step": 1401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 1262.9405517578125, "completions/mean_terminated_length": 1262.9405517578125, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 2.1838006230529596, "grad_norm": 0.6306037306785583, "kl": 0.05243268795311451, "learning_rate": 1.1306249999999999e-06, "loss": -0.0288, "num_tokens": 182362582.0, "reward": 1.5457426309585571, "reward_std": 0.07910532504320145, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5576474666595459, "rewards/correct_reward_func/std": 0.17634469270706177, "step": 1402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 1336.011962890625, "completions/mean_terminated_length": 1336.011962890625, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 2.1853582554517135, "grad_norm": 0.6187497973442078, "kl": 0.05358815938234329, "learning_rate": 1.1299999999999998e-06, "loss": -0.0088, "num_tokens": 182480837.0, "reward": 1.4827290773391724, "reward_std": 0.07690685242414474, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48272886872291565, "rewards/correct_reward_func/std": 0.15018264949321747, "step": 1403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1353.1785888671875, "completions/mean_terminated_length": 1353.1785888671875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.1869158878504673, "grad_norm": 0.6150205731391907, "kl": 0.05179154872894287, "learning_rate": 1.1293749999999999e-06, "loss": 0.0224, "num_tokens": 182600498.0, "reward": 1.5574157238006592, "reward_std": 0.049491725862026215, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5574156641960144, "rewards/correct_reward_func/std": 0.21539902687072754, "step": 1404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1363.40478515625, "completions/mean_terminated_length": 1281.1324462890625, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 2.188473520249221, "grad_norm": 0.5901775360107422, "kl": 0.05035247094929218, "learning_rate": 1.1287499999999998e-06, "loss": 0.076, "num_tokens": 182720970.0, "reward": 1.5253045558929443, "reward_std": 0.0753924772143364, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5253044962882996, "rewards/correct_reward_func/std": 0.18221648037433624, "step": 1405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 1394.6785888671875, "completions/mean_terminated_length": 1312.7830810546875, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 2.190031152647975, "grad_norm": 1.4934645891189575, "kl": 0.09698312915861607, "learning_rate": 1.1281249999999999e-06, "loss": 0.0679, "num_tokens": 182843913.0, "reward": 1.5091545581817627, "reward_std": 0.06092026084661484, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5091544985771179, "rewards/correct_reward_func/std": 0.17217372357845306, "step": 1406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1891.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 1341.40478515625, "completions/mean_terminated_length": 1341.40478515625, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 2.191588785046729, "grad_norm": 0.6217133402824402, "kl": 0.05375100485980511, "learning_rate": 1.1274999999999998e-06, "loss": -0.0054, "num_tokens": 182962633.0, "reward": 1.5640146732330322, "reward_std": 0.08299466967582703, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5759192705154419, "rewards/correct_reward_func/std": 0.20054613053798676, "step": 1407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1407.90478515625, "completions/mean_terminated_length": 1326.1685791015625, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.1931464174454827, "grad_norm": 0.5736817121505737, "kl": 0.050758618861436844, "learning_rate": 1.1268750000000001e-06, "loss": 0.0179, "num_tokens": 183086939.0, "reward": 1.5700607299804688, "reward_std": 0.060461703687906265, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5700604319572449, "rewards/correct_reward_func/std": 0.17380861937999725, "step": 1408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1334.8095703125, "completions/mean_terminated_length": 1334.8095703125, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 2.194704049844237, "grad_norm": 0.6261428594589233, "kl": 0.05368942767381668, "learning_rate": 1.12625e-06, "loss": -0.0284, "num_tokens": 183205219.0, "reward": 1.5286486148834229, "reward_std": 0.04812389984726906, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5286486148834229, "rewards/correct_reward_func/std": 0.1558607965707779, "step": 1409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 1358.71435546875, "completions/mean_terminated_length": 1358.71435546875, "completions/min_length": 722.0, "completions/min_terminated_length": 722.0, "epoch": 2.196261682242991, "grad_norm": 0.6098248362541199, "kl": 0.05205678194761276, "learning_rate": 1.1256250000000001e-06, "loss": 0.0024, "num_tokens": 183325399.0, "reward": 1.4650918245315552, "reward_std": 0.0883210301399231, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4769965708255768, "rewards/correct_reward_func/std": 0.13771669566631317, "step": 1410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 1276.0833740234375, "completions/mean_terminated_length": 1276.0833740234375, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 2.1978193146417446, "grad_norm": 0.6695262789726257, "kl": 0.054968561977148056, "learning_rate": 1.125e-06, "loss": -0.007, "num_tokens": 183438512.0, "reward": 1.4597842693328857, "reward_std": 0.10109290480613708, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4716888964176178, "rewards/correct_reward_func/std": 0.18217721581459045, "step": 1411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1396.892822265625, "completions/mean_terminated_length": 1396.892822265625, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 2.1993769470404985, "grad_norm": 0.6523039937019348, "kl": 0.05474048666656017, "learning_rate": 1.124375e-06, "loss": 0.0138, "num_tokens": 183561875.0, "reward": 1.5459095239639282, "reward_std": 0.06007232144474983, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.545909583568573, "rewards/correct_reward_func/std": 0.1700262576341629, "step": 1412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1314.6905517578125, "completions/mean_terminated_length": 1314.6905517578125, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.2009345794392523, "grad_norm": 0.606766402721405, "kl": 0.0526558943092823, "learning_rate": 1.12375e-06, "loss": -0.0103, "num_tokens": 183678219.0, "reward": 1.4738417863845825, "reward_std": 0.056224629282951355, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4738418161869049, "rewards/correct_reward_func/std": 0.1811302900314331, "step": 1413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2152.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 1349.5, "completions/mean_terminated_length": 1349.5, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 2.202492211838006, "grad_norm": 0.615530788898468, "kl": 0.05169066973030567, "learning_rate": 1.123125e-06, "loss": 0.0266, "num_tokens": 183797481.0, "reward": 1.5107375383377075, "reward_std": 0.05849654600024223, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5107373595237732, "rewards/correct_reward_func/std": 0.2403155118227005, "step": 1414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 1376.3690185546875, "completions/mean_terminated_length": 1376.3690185546875, "completions/min_length": 446.0, "completions/min_terminated_length": 446.0, "epoch": 2.20404984423676, "grad_norm": 0.5918955206871033, "kl": 0.052717624232172966, "learning_rate": 1.1225e-06, "loss": 0.0117, "num_tokens": 183919240.0, "reward": 1.4787405729293823, "reward_std": 0.047720830887556076, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4787404537200928, "rewards/correct_reward_func/std": 0.1984664648771286, "step": 1415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2361.0, "completions/max_terminated_length": 2361.0, "completions/mean_length": 1332.6309814453125, "completions/mean_terminated_length": 1332.6309814453125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 2.205607476635514, "grad_norm": 0.6312891244888306, "kl": 0.051543304696679115, "learning_rate": 1.121875e-06, "loss": -0.0232, "num_tokens": 184037229.0, "reward": 1.4701073169708252, "reward_std": 0.05198419839143753, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47010722756385803, "rewards/correct_reward_func/std": 0.13098910450935364, "step": 1416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1403.3095703125, "completions/mean_terminated_length": 1403.3095703125, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 2.207165109034268, "grad_norm": 0.5982795357704163, "kl": 0.05091387219727039, "learning_rate": 1.12125e-06, "loss": -0.0126, "num_tokens": 184161161.0, "reward": 1.4681129455566406, "reward_std": 0.06841563433408737, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4681129455566406, "rewards/correct_reward_func/std": 0.11516620218753815, "step": 1417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2140.0, "completions/max_terminated_length": 2140.0, "completions/mean_length": 1339.642822265625, "completions/mean_terminated_length": 1339.642822265625, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 2.208722741433022, "grad_norm": 0.6313459277153015, "kl": 0.053987784311175346, "learning_rate": 1.120625e-06, "loss": 0.0308, "num_tokens": 184279481.0, "reward": 1.5264147520065308, "reward_std": 0.04343011975288391, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5264146327972412, "rewards/correct_reward_func/std": 0.14351478219032288, "step": 1418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 1428.4761962890625, "completions/mean_terminated_length": 1346.9879150390625, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 2.210280373831776, "grad_norm": 0.6099157333374023, "kl": 0.050739964470267296, "learning_rate": 1.12e-06, "loss": 0.0626, "num_tokens": 184405431.0, "reward": 1.438481330871582, "reward_std": 0.12231256812810898, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4622907042503357, "rewards/correct_reward_func/std": 0.14225439727306366, "step": 1419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 1422.702392578125, "completions/mean_terminated_length": 1341.14453125, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 2.2118380062305296, "grad_norm": 0.629956841468811, "kl": 0.04776131920516491, "learning_rate": 1.119375e-06, "loss": 0.105, "num_tokens": 184530938.0, "reward": 1.460375189781189, "reward_std": 0.11942674964666367, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.48418471217155457, "rewards/correct_reward_func/std": 0.14827889204025269, "step": 1420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2035.0, "completions/max_terminated_length": 2035.0, "completions/mean_length": 1397.8095703125, "completions/mean_terminated_length": 1397.8095703125, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 2.2133956386292835, "grad_norm": 0.5877498984336853, "kl": 0.050654979422688484, "learning_rate": 1.1187499999999999e-06, "loss": -0.0129, "num_tokens": 184654510.0, "reward": 1.4832011461257935, "reward_std": 0.06599172949790955, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4832010269165039, "rewards/correct_reward_func/std": 0.1746063381433487, "step": 1421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1991.0, "completions/max_terminated_length": 1991.0, "completions/mean_length": 1373.8809814453125, "completions/mean_terminated_length": 1373.8809814453125, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 2.2149532710280373, "grad_norm": 0.6204925775527954, "kl": 0.051009414717555046, "learning_rate": 1.118125e-06, "loss": -0.0083, "num_tokens": 184776006.0, "reward": 1.471339464187622, "reward_std": 0.0575740784406662, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47133955359458923, "rewards/correct_reward_func/std": 0.11522329598665237, "step": 1422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1395.607177734375, "completions/mean_terminated_length": 1395.607177734375, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 2.216510903426791, "grad_norm": 0.6099293828010559, "kl": 0.05443691648542881, "learning_rate": 1.1174999999999999e-06, "loss": -0.0002, "num_tokens": 184899189.0, "reward": 1.4974089860916138, "reward_std": 0.052442971616983414, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49740901589393616, "rewards/correct_reward_func/std": 0.16189044713974, "step": 1423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2582.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 1374.34521484375, "completions/mean_terminated_length": 1374.34521484375, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "epoch": 2.218068535825545, "grad_norm": 0.6183286905288696, "kl": 0.05184337683022022, "learning_rate": 1.116875e-06, "loss": 0.0116, "num_tokens": 185020580.0, "reward": 1.4078788757324219, "reward_std": 0.10648416727781296, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.4435930848121643, "rewards/correct_reward_func/std": 0.11820869892835617, "step": 1424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 1395.1785888671875, "completions/mean_terminated_length": 1395.1785888671875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 2.2196261682242993, "grad_norm": 0.588648796081543, "kl": 0.05046474002301693, "learning_rate": 1.11625e-06, "loss": 0.0156, "num_tokens": 185143577.0, "reward": 1.5451319217681885, "reward_std": 0.08950678259134293, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5451318621635437, "rewards/correct_reward_func/std": 0.15927232801914215, "step": 1425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 1437.8809814453125, "completions/mean_terminated_length": 1437.8809814453125, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 2.221183800623053, "grad_norm": 0.5990937352180481, "kl": 0.05080426298081875, "learning_rate": 1.115625e-06, "loss": 0.0144, "num_tokens": 185270545.0, "reward": 1.4890046119689941, "reward_std": 0.06869736313819885, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48900458216667175, "rewards/correct_reward_func/std": 0.12993039190769196, "step": 1426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1956.0, "completions/max_terminated_length": 1956.0, "completions/mean_length": 1344.3690185546875, "completions/mean_terminated_length": 1344.3690185546875, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 2.222741433021807, "grad_norm": 0.5936654806137085, "kl": 0.049339067190885544, "learning_rate": 1.115e-06, "loss": -0.0041, "num_tokens": 185389370.0, "reward": 1.5378203392028809, "reward_std": 0.06178745999932289, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5378202795982361, "rewards/correct_reward_func/std": 0.14922647178173065, "step": 1427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1975.0, "completions/max_terminated_length": 1975.0, "completions/mean_length": 1386.952392578125, "completions/mean_terminated_length": 1386.952392578125, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 2.2242990654205608, "grad_norm": 0.6407909989356995, "kl": 0.05040920153260231, "learning_rate": 1.1143749999999998e-06, "loss": 0.0001, "num_tokens": 185512108.0, "reward": 1.5220898389816284, "reward_std": 0.06172652542591095, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5220896601676941, "rewards/correct_reward_func/std": 0.15288163721561432, "step": 1428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2539.0, "completions/max_terminated_length": 2539.0, "completions/mean_length": 1378.0, "completions/mean_terminated_length": 1378.0, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 2.2258566978193146, "grad_norm": 0.6217040419578552, "kl": 0.04973181150853634, "learning_rate": 1.11375e-06, "loss": -0.0146, "num_tokens": 185633824.0, "reward": 1.4893889427185059, "reward_std": 0.047641631215810776, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48938891291618347, "rewards/correct_reward_func/std": 0.10297294706106186, "step": 1429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1838.0, "completions/mean_length": 1413.0833740234375, "completions/mean_terminated_length": 1331.4095458984375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 2.2274143302180685, "grad_norm": 0.5969963669776917, "kl": 0.05029815621674061, "learning_rate": 1.1131249999999998e-06, "loss": 0.0488, "num_tokens": 185758469.0, "reward": 1.4731628894805908, "reward_std": 0.07649555057287216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47316280007362366, "rewards/correct_reward_func/std": 0.16156095266342163, "step": 1430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1401.952392578125, "completions/mean_terminated_length": 1401.952392578125, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 2.2289719626168223, "grad_norm": 0.5829575061798096, "kl": 0.048983871936798096, "learning_rate": 1.1125e-06, "loss": 0.0108, "num_tokens": 185882311.0, "reward": 1.4685138463974, "reward_std": 0.05959363281726837, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46851375699043274, "rewards/correct_reward_func/std": 0.13735638558864594, "step": 1431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2721.0, "completions/max_terminated_length": 2721.0, "completions/mean_length": 1422.96435546875, "completions/mean_terminated_length": 1422.96435546875, "completions/min_length": 973.0, "completions/min_terminated_length": 973.0, "epoch": 2.230529595015576, "grad_norm": 0.583566427230835, "kl": 0.05069119483232498, "learning_rate": 1.1118749999999998e-06, "loss": 0.0524, "num_tokens": 186007882.0, "reward": 1.4502121210098267, "reward_std": 0.06919793784618378, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4621167778968811, "rewards/correct_reward_func/std": 0.11339413374662399, "step": 1432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1453.9881591796875, "completions/mean_terminated_length": 1453.9881591796875, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "epoch": 2.2320872274143304, "grad_norm": 0.5851161479949951, "kl": 0.05229689180850983, "learning_rate": 1.11125e-06, "loss": -0.0107, "num_tokens": 186135975.0, "reward": 1.437343955039978, "reward_std": 0.062042396515607834, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43734386563301086, "rewards/correct_reward_func/std": 0.10962171107530594, "step": 1433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 1392.7857666015625, "completions/mean_terminated_length": 1392.7857666015625, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 2.2336448598130842, "grad_norm": 0.5569678544998169, "kl": 0.05025433748960495, "learning_rate": 1.1106249999999998e-06, "loss": -0.0193, "num_tokens": 186259161.0, "reward": 1.4958064556121826, "reward_std": 0.061311110854148865, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4958062767982483, "rewards/correct_reward_func/std": 0.16076527535915375, "step": 1434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3081.0, "completions/mean_length": 1492.6309814453125, "completions/mean_terminated_length": 1411.9156494140625, "completions/min_length": 952.0, "completions/min_terminated_length": 952.0, "epoch": 2.235202492211838, "grad_norm": 0.5863468050956726, "kl": 0.05004304647445679, "learning_rate": 1.11e-06, "loss": 0.0124, "num_tokens": 186390398.0, "reward": 1.4537379741668701, "reward_std": 0.08780191093683243, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4656427502632141, "rewards/correct_reward_func/std": 0.1634255200624466, "step": 1435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2005.0, "completions/max_terminated_length": 2005.0, "completions/mean_length": 1455.6190185546875, "completions/mean_terminated_length": 1455.6190185546875, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 2.236760124610592, "grad_norm": 0.5803380608558655, "kl": 0.05177968554198742, "learning_rate": 1.1093749999999999e-06, "loss": 0.0017, "num_tokens": 186518622.0, "reward": 1.4930886030197144, "reward_std": 0.0740039274096489, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5049933195114136, "rewards/correct_reward_func/std": 0.11568497121334076, "step": 1436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3717.0, "completions/max_terminated_length": 3717.0, "completions/mean_length": 1450.9405517578125, "completions/mean_terminated_length": 1450.9405517578125, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 2.2383177570093458, "grad_norm": 0.5814592838287354, "kl": 0.05251036770641804, "learning_rate": 1.1087499999999998e-06, "loss": 0.0059, "num_tokens": 186646321.0, "reward": 1.489889144897461, "reward_std": 0.06766058504581451, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4898890554904938, "rewards/correct_reward_func/std": 0.15255028009414673, "step": 1437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 1407.952392578125, "completions/mean_terminated_length": 1407.952392578125, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 2.2398753894080996, "grad_norm": 0.6263703107833862, "kl": 0.05063655413687229, "learning_rate": 1.1081249999999999e-06, "loss": 0.0463, "num_tokens": 186770505.0, "reward": 1.508715271949768, "reward_std": 0.06604710966348648, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5087152123451233, "rewards/correct_reward_func/std": 0.15267281234264374, "step": 1438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2793.0, "completions/max_terminated_length": 2793.0, "completions/mean_length": 1470.21435546875, "completions/mean_terminated_length": 1470.21435546875, "completions/min_length": 999.0, "completions/min_terminated_length": 999.0, "epoch": 2.2414330218068534, "grad_norm": 0.563240110874176, "kl": 0.050285084173083305, "learning_rate": 1.1075e-06, "loss": -0.0135, "num_tokens": 186900027.0, "reward": 1.536528468132019, "reward_std": 0.06567844748497009, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5365285277366638, "rewards/correct_reward_func/std": 0.12096237391233444, "step": 1439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 1417.4285888671875, "completions/mean_terminated_length": 1417.4285888671875, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 2.2429906542056073, "grad_norm": 0.5895605087280273, "kl": 0.05225484073162079, "learning_rate": 1.106875e-06, "loss": -0.0179, "num_tokens": 187024941.0, "reward": 1.4472278356552124, "reward_std": 0.06428219377994537, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4472276568412781, "rewards/correct_reward_func/std": 0.11170006543397903, "step": 1440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 1515.2381591796875, "completions/mean_terminated_length": 1434.795166015625, "completions/min_length": 582.0, "completions/min_terminated_length": 582.0, "epoch": 2.2445482866043616, "grad_norm": 0.5636044144630432, "kl": 0.049497274681925774, "learning_rate": 1.10625e-06, "loss": 0.0453, "num_tokens": 187158167.0, "reward": 1.436218023300171, "reward_std": 0.053663164377212524, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43621793389320374, "rewards/correct_reward_func/std": 0.1188172921538353, "step": 1441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 1426.5595703125, "completions/mean_terminated_length": 1426.5595703125, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "epoch": 2.2461059190031154, "grad_norm": 0.5886960625648499, "kl": 0.051049694418907166, "learning_rate": 1.1056250000000001e-06, "loss": 0.0056, "num_tokens": 187283872.0, "reward": 1.4949028491973877, "reward_std": 0.06216200441122055, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49490275979042053, "rewards/correct_reward_func/std": 0.15425625443458557, "step": 1442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5254.0, "completions/max_terminated_length": 5254.0, "completions/mean_length": 1444.1429443359375, "completions/mean_terminated_length": 1444.1429443359375, "completions/min_length": 1002.0, "completions/min_terminated_length": 1002.0, "epoch": 2.2476635514018692, "grad_norm": 0.5931892991065979, "kl": 0.053183846175670624, "learning_rate": 1.105e-06, "loss": 0.0266, "num_tokens": 187410964.0, "reward": 1.462154746055603, "reward_std": 0.05214592069387436, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4621546268463135, "rewards/correct_reward_func/std": 0.13468529284000397, "step": 1443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1555.7857666015625, "completions/mean_terminated_length": 1555.7857666015625, "completions/min_length": 544.0, "completions/min_terminated_length": 544.0, "epoch": 2.249221183800623, "grad_norm": 0.5923489928245544, "kl": 0.05316341482102871, "learning_rate": 1.1043750000000001e-06, "loss": 0.0072, "num_tokens": 187548052.0, "reward": 1.491997480392456, "reward_std": 0.05244386941194534, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4919973909854889, "rewards/correct_reward_func/std": 0.1004580557346344, "step": 1444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1490.0238037109375, "completions/mean_terminated_length": 1409.277099609375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 2.250778816199377, "grad_norm": 0.5598641037940979, "kl": 0.05210278742015362, "learning_rate": 1.10375e-06, "loss": 0.0749, "num_tokens": 187679166.0, "reward": 1.402806043624878, "reward_std": 0.060150109231472015, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40280595421791077, "rewards/correct_reward_func/std": 0.16774208843708038, "step": 1445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 1464.3690185546875, "completions/mean_terminated_length": 1464.3690185546875, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "epoch": 2.2523364485981308, "grad_norm": 0.6024284362792969, "kl": 0.05357578210532665, "learning_rate": 1.103125e-06, "loss": -0.0216, "num_tokens": 187808329.0, "reward": 1.492249608039856, "reward_std": 0.07721719145774841, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5041544437408447, "rewards/correct_reward_func/std": 0.1484045684337616, "step": 1446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 1463.4285888671875, "completions/mean_terminated_length": 1463.4285888671875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 2.2538940809968846, "grad_norm": 0.6115955710411072, "kl": 0.0518728569149971, "learning_rate": 1.1025e-06, "loss": -0.0283, "num_tokens": 187937059.0, "reward": 1.5513789653778076, "reward_std": 0.05571288987994194, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5513787269592285, "rewards/correct_reward_func/std": 0.18284963071346283, "step": 1447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1464.047607421875, "completions/mean_terminated_length": 1464.047607421875, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 2.2554517133956384, "grad_norm": 0.5737836956977844, "kl": 0.052882302552461624, "learning_rate": 1.101875e-06, "loss": -0.0071, "num_tokens": 188065967.0, "reward": 1.499729037284851, "reward_std": 0.06153211370110512, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4997289776802063, "rewards/correct_reward_func/std": 0.1549866646528244, "step": 1448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1443.297607421875, "completions/mean_terminated_length": 1443.297607421875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 2.2570093457943923, "grad_norm": 0.5629928112030029, "kl": 0.053539082407951355, "learning_rate": 1.10125e-06, "loss": -0.0086, "num_tokens": 188193186.0, "reward": 1.5459725856781006, "reward_std": 0.05502960830926895, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5459725856781006, "rewards/correct_reward_func/std": 0.19756190478801727, "step": 1449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 1426.09521484375, "completions/mean_terminated_length": 1426.09521484375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "epoch": 2.2585669781931466, "grad_norm": 0.5836719274520874, "kl": 0.05398919619619846, "learning_rate": 1.100625e-06, "loss": 0.0198, "num_tokens": 188319236.0, "reward": 1.4717330932617188, "reward_std": 0.05614446476101875, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47173309326171875, "rewards/correct_reward_func/std": 0.18538041412830353, "step": 1450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1482.702392578125, "completions/mean_terminated_length": 1482.702392578125, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "epoch": 2.2601246105919004, "grad_norm": 0.6005244255065918, "kl": 0.05629855394363403, "learning_rate": 1.1e-06, "loss": -0.0099, "num_tokens": 188449747.0, "reward": 1.397466778755188, "reward_std": 0.11398248374462128, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4212762415409088, "rewards/correct_reward_func/std": 0.12703315913677216, "step": 1451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1465.0357666015625, "completions/mean_terminated_length": 1383.9879150390625, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 2.2616822429906542, "grad_norm": 0.6022769808769226, "kl": 0.05183589272201061, "learning_rate": 1.099375e-06, "loss": 0.0467, "num_tokens": 188578624.0, "reward": 1.5282100439071655, "reward_std": 0.07352463155984879, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5282100439071655, "rewards/correct_reward_func/std": 0.16962075233459473, "step": 1452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2016.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1502.3929443359375, "completions/mean_terminated_length": 1502.3929443359375, "completions/min_length": 1061.0, "completions/min_terminated_length": 1061.0, "epoch": 2.263239875389408, "grad_norm": 0.5998267531394958, "kl": 0.055121367797255516, "learning_rate": 1.0987499999999999e-06, "loss": -0.0062, "num_tokens": 188710963.0, "reward": 1.5209311246871948, "reward_std": 0.0762595385313034, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5328359007835388, "rewards/correct_reward_func/std": 0.11844736337661743, "step": 1453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 1541.202392578125, "completions/mean_terminated_length": 1541.202392578125, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 2.264797507788162, "grad_norm": 0.5792635679244995, "kl": 0.0507790707051754, "learning_rate": 1.098125e-06, "loss": 0.024, "num_tokens": 188846610.0, "reward": 1.4762332439422607, "reward_std": 0.09584526717662811, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5000427961349487, "rewards/correct_reward_func/std": 0.17316730320453644, "step": 1454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2339.0, "completions/max_terminated_length": 2339.0, "completions/mean_length": 1495.3690185546875, "completions/mean_terminated_length": 1495.3690185546875, "completions/min_length": 761.0, "completions/min_terminated_length": 761.0, "epoch": 2.2663551401869158, "grad_norm": 0.5584580302238464, "kl": 0.054367583245038986, "learning_rate": 1.0974999999999999e-06, "loss": -0.0132, "num_tokens": 188978203.0, "reward": 1.5052847862243652, "reward_std": 0.0511532798409462, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5052847266197205, "rewards/correct_reward_func/std": 0.1431160867214203, "step": 1455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1476.5, "completions/mean_terminated_length": 1476.5, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 2.2679127725856696, "grad_norm": 0.6262014508247375, "kl": 0.05148451402783394, "learning_rate": 1.096875e-06, "loss": -0.0127, "num_tokens": 189108277.0, "reward": 1.5883116722106934, "reward_std": 0.04737496376037598, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5883118510246277, "rewards/correct_reward_func/std": 0.1377226859331131, "step": 1456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2733.0, "completions/max_terminated_length": 2733.0, "completions/mean_length": 1517.2261962890625, "completions/mean_terminated_length": 1517.2261962890625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 2.269470404984424, "grad_norm": 0.5892953872680664, "kl": 0.05538878217339516, "learning_rate": 1.0962499999999999e-06, "loss": -0.0001, "num_tokens": 189241904.0, "reward": 1.4924298524856567, "reward_std": 0.0795227438211441, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.504334568977356, "rewards/correct_reward_func/std": 0.13135887682437897, "step": 1457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 1531.5833740234375, "completions/mean_terminated_length": 1531.5833740234375, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 2.2710280373831777, "grad_norm": 0.5622796416282654, "kl": 0.05421471409499645, "learning_rate": 1.095625e-06, "loss": -0.0275, "num_tokens": 189376653.0, "reward": 1.5853981971740723, "reward_std": 0.06207668408751488, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5853981375694275, "rewards/correct_reward_func/std": 0.14206776022911072, "step": 1458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 1476.952392578125, "completions/mean_terminated_length": 1476.952392578125, "completions/min_length": 31.0, "completions/min_terminated_length": 31.0, "epoch": 2.2725856697819315, "grad_norm": 0.5906720757484436, "kl": 0.056371189653873444, "learning_rate": 1.0949999999999999e-06, "loss": -0.0422, "num_tokens": 189506651.0, "reward": 1.460417628288269, "reward_std": 0.09250402450561523, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4723222851753235, "rewards/correct_reward_func/std": 0.1680890917778015, "step": 1459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2865.0, "completions/max_terminated_length": 2865.0, "completions/mean_length": 1558.1190185546875, "completions/mean_terminated_length": 1558.1190185546875, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 2.2741433021806854, "grad_norm": 0.5651521682739258, "kl": 0.05462817847728729, "learning_rate": 1.094375e-06, "loss": 0.0157, "num_tokens": 189643491.0, "reward": 1.5475213527679443, "reward_std": 0.06007247418165207, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5475212931632996, "rewards/correct_reward_func/std": 0.15246212482452393, "step": 1460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2584.0, "completions/max_terminated_length": 2584.0, "completions/mean_length": 1613.3333740234375, "completions/mean_terminated_length": 1613.3333740234375, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "epoch": 2.2757009345794392, "grad_norm": 0.5806359052658081, "kl": 0.05488669499754906, "learning_rate": 1.09375e-06, "loss": 0.0108, "num_tokens": 189785215.0, "reward": 1.5184024572372437, "reward_std": 0.07876469194889069, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5303071737289429, "rewards/correct_reward_func/std": 0.12674874067306519, "step": 1461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 1658.261962890625, "completions/mean_terminated_length": 1579.5421142578125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "epoch": 2.277258566978193, "grad_norm": 0.520882248878479, "kl": 0.0543929822742939, "learning_rate": 1.0931249999999998e-06, "loss": 0.0524, "num_tokens": 189930515.0, "reward": 1.5174694061279297, "reward_std": 0.10750548541545868, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5293741822242737, "rewards/correct_reward_func/std": 0.1827918291091919, "step": 1462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2771.0, "completions/max_terminated_length": 2771.0, "completions/mean_length": 1523.047607421875, "completions/mean_terminated_length": 1523.047607421875, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "epoch": 2.278816199376947, "grad_norm": 0.5634313821792603, "kl": 0.05285234749317169, "learning_rate": 1.0925e-06, "loss": 0.0198, "num_tokens": 190064367.0, "reward": 1.5356688499450684, "reward_std": 0.06631126999855042, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5475736260414124, "rewards/correct_reward_func/std": 0.21322400867938995, "step": 1463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2513.0, "completions/max_terminated_length": 2513.0, "completions/mean_length": 1620.21435546875, "completions/mean_terminated_length": 1620.21435546875, "completions/min_length": 1114.0, "completions/min_terminated_length": 1114.0, "epoch": 2.2803738317757007, "grad_norm": 0.5556662678718567, "kl": 0.053571244701743126, "learning_rate": 1.0918749999999998e-06, "loss": 0.0068, "num_tokens": 190206489.0, "reward": 1.5448968410491943, "reward_std": 0.05326950177550316, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5448968410491943, "rewards/correct_reward_func/std": 0.10366009920835495, "step": 1464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 1517.25, "completions/mean_terminated_length": 1517.25, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.2819314641744546, "grad_norm": 0.5898197293281555, "kl": 0.0547268632799387, "learning_rate": 1.09125e-06, "loss": 0.0004, "num_tokens": 190339842.0, "reward": 1.510769248008728, "reward_std": 0.12588609755039215, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.534578800201416, "rewards/correct_reward_func/std": 0.1230318695306778, "step": 1465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2494.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 1599.65478515625, "completions/mean_terminated_length": 1599.65478515625, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 2.283489096573209, "grad_norm": 0.5404325723648071, "kl": 0.0551269818097353, "learning_rate": 1.0906249999999998e-06, "loss": 0.0287, "num_tokens": 190480147.0, "reward": 1.4938769340515137, "reward_std": 0.05949753522872925, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49387696385383606, "rewards/correct_reward_func/std": 0.11247193068265915, "step": 1466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 1604.0714111328125, "completions/mean_terminated_length": 1524.69873046875, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 2.2850467289719627, "grad_norm": 0.5366068482398987, "kl": 0.05324495583772659, "learning_rate": 1.09e-06, "loss": 0.0651, "num_tokens": 190620739.0, "reward": 1.4881486892700195, "reward_std": 0.07596345245838165, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48814862966537476, "rewards/correct_reward_func/std": 0.14390142261981964, "step": 1467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1623.25, "completions/mean_terminated_length": 1623.25, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 2.2866043613707165, "grad_norm": 0.5547800660133362, "kl": 0.052822647616267204, "learning_rate": 1.0893749999999998e-06, "loss": -0.0151, "num_tokens": 190763152.0, "reward": 1.5945003032684326, "reward_std": 0.0555381216108799, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5945003032684326, "rewards/correct_reward_func/std": 0.14755821228027344, "step": 1468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2674.0, "completions/max_terminated_length": 2674.0, "completions/mean_length": 1596.916748046875, "completions/mean_terminated_length": 1596.916748046875, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 2.2881619937694704, "grad_norm": 0.6067368388175964, "kl": 0.05675346963107586, "learning_rate": 1.08875e-06, "loss": -0.0058, "num_tokens": 190903329.0, "reward": 1.4694623947143555, "reward_std": 0.04608185961842537, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4694623351097107, "rewards/correct_reward_func/std": 0.17574653029441833, "step": 1469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2530.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1565.011962890625, "completions/mean_terminated_length": 1565.011962890625, "completions/min_length": 1040.0, "completions/min_terminated_length": 1040.0, "epoch": 2.289719626168224, "grad_norm": 0.5567640066146851, "kl": 0.05394000932574272, "learning_rate": 1.088125e-06, "loss": 0.0198, "num_tokens": 191040898.0, "reward": 1.4711778163909912, "reward_std": 0.039712656289339066, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4711777865886688, "rewards/correct_reward_func/std": 0.08949955552816391, "step": 1470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 1618.7261962890625, "completions/mean_terminated_length": 1618.7261962890625, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 2.291277258566978, "grad_norm": 0.586256742477417, "kl": 0.05433792620897293, "learning_rate": 1.0875e-06, "loss": 0.0116, "num_tokens": 191183135.0, "reward": 1.4506512880325317, "reward_std": 0.05018966645002365, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4506511092185974, "rewards/correct_reward_func/std": 0.10949051380157471, "step": 1471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2468.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 1508.6785888671875, "completions/mean_terminated_length": 1508.6785888671875, "completions/min_length": 9.0, "completions/min_terminated_length": 9.0, "epoch": 2.292834890965732, "grad_norm": 0.5424062609672546, "kl": 0.05285138823091984, "learning_rate": 1.086875e-06, "loss": -0.0299, "num_tokens": 191315690.0, "reward": 1.5331076383590698, "reward_std": 0.06884755939245224, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.533107578754425, "rewards/correct_reward_func/std": 0.2554265856742859, "step": 1472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6849.0, "completions/max_terminated_length": 6849.0, "completions/mean_length": 1657.9761962890625, "completions/mean_terminated_length": 1657.9761962890625, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 2.294392523364486, "grad_norm": 0.5311789512634277, "kl": 0.05233638174831867, "learning_rate": 1.08625e-06, "loss": 0.011, "num_tokens": 191461038.0, "reward": 1.5553193092346191, "reward_std": 0.07243891060352325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5553191900253296, "rewards/correct_reward_func/std": 0.15028366446495056, "step": 1473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 1547.761962890625, "completions/mean_terminated_length": 1547.761962890625, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 2.29595015576324, "grad_norm": 0.57155442237854, "kl": 0.054684095084667206, "learning_rate": 1.085625e-06, "loss": 0.016, "num_tokens": 191596894.0, "reward": 1.5152174234390259, "reward_std": 0.0932183489203453, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5271221399307251, "rewards/correct_reward_func/std": 0.16357393562793732, "step": 1474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 1595.9881591796875, "completions/mean_terminated_length": 1595.9881591796875, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 2.297507788161994, "grad_norm": 0.5688794255256653, "kl": 0.05476943403482437, "learning_rate": 1.085e-06, "loss": -0.0002, "num_tokens": 191737065.0, "reward": 1.5350298881530762, "reward_std": 0.07860948890447617, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5469346046447754, "rewards/correct_reward_func/std": 0.1375960111618042, "step": 1475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 1568.3929443359375, "completions/mean_terminated_length": 1568.3929443359375, "completions/min_length": 1005.0, "completions/min_terminated_length": 1005.0, "epoch": 2.2990654205607477, "grad_norm": 0.587960958480835, "kl": 0.05632602423429489, "learning_rate": 1.084375e-06, "loss": 0.0218, "num_tokens": 191874858.0, "reward": 1.5271594524383545, "reward_std": 0.05088076740503311, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5271594524383545, "rewards/correct_reward_func/std": 0.1265942007303238, "step": 1476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 1548.8929443359375, "completions/mean_terminated_length": 1548.8929443359375, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 2.3006230529595015, "grad_norm": 0.5741451978683472, "kl": 0.05568164400756359, "learning_rate": 1.08375e-06, "loss": 0.0093, "num_tokens": 192010947.0, "reward": 1.5878055095672607, "reward_std": 0.05139214172959328, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5878052711486816, "rewards/correct_reward_func/std": 0.15233469009399414, "step": 1477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1437.71435546875, "completions/mean_terminated_length": 1437.71435546875, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 2.3021806853582554, "grad_norm": 0.6467793583869934, "kl": 0.055395325645804405, "learning_rate": 1.083125e-06, "loss": 0.0011, "num_tokens": 192137415.0, "reward": 1.407796025276184, "reward_std": 0.04773819446563721, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4077959656715393, "rewards/correct_reward_func/std": 0.08783245086669922, "step": 1478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2664.0, "completions/max_terminated_length": 2664.0, "completions/mean_length": 1583.202392578125, "completions/mean_terminated_length": 1583.202392578125, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 2.303738317757009, "grad_norm": 0.5967252850532532, "kl": 0.052205393090844154, "learning_rate": 1.0825e-06, "loss": 0.0267, "num_tokens": 192276416.0, "reward": 1.4751042127609253, "reward_std": 0.04558471590280533, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4751041829586029, "rewards/correct_reward_func/std": 0.12813009321689606, "step": 1479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1523.357177734375, "completions/mean_terminated_length": 1523.357177734375, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 2.305295950155763, "grad_norm": 0.5554695129394531, "kl": 0.05453235283493996, "learning_rate": 1.081875e-06, "loss": 0.0211, "num_tokens": 192410354.0, "reward": 1.5111632347106934, "reward_std": 0.057304076850414276, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5111631751060486, "rewards/correct_reward_func/std": 0.17143847048282623, "step": 1480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3290.0, "completions/max_terminated_length": 3290.0, "completions/mean_length": 1554.047607421875, "completions/mean_terminated_length": 1554.047607421875, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 2.306853582554517, "grad_norm": 0.6291865706443787, "kl": 0.053843943402171135, "learning_rate": 1.08125e-06, "loss": 0.036, "num_tokens": 192546930.0, "reward": 1.4948196411132812, "reward_std": 0.07175532728433609, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4948195815086365, "rewards/correct_reward_func/std": 0.17542997002601624, "step": 1481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1537.011962890625, "completions/mean_terminated_length": 1537.011962890625, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 2.308411214953271, "grad_norm": 0.5707323551177979, "kl": 0.05072789266705513, "learning_rate": 1.080625e-06, "loss": -0.004, "num_tokens": 192682027.0, "reward": 1.5787837505340576, "reward_std": 0.06122643128037453, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5787836313247681, "rewards/correct_reward_func/std": 0.14631566405296326, "step": 1482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1486.761962890625, "completions/mean_terminated_length": 1486.761962890625, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 2.309968847352025, "grad_norm": 0.6203653812408447, "kl": 0.05305870249867439, "learning_rate": 1.08e-06, "loss": -0.0033, "num_tokens": 192812843.0, "reward": 1.4830313920974731, "reward_std": 0.07142604142427444, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49493607878685, "rewards/correct_reward_func/std": 0.18228484690189362, "step": 1483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2466.0, "completions/max_terminated_length": 2466.0, "completions/mean_length": 1556.34521484375, "completions/mean_terminated_length": 1556.34521484375, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 2.311526479750779, "grad_norm": 0.5840080380439758, "kl": 0.05211931839585304, "learning_rate": 1.079375e-06, "loss": 0.0028, "num_tokens": 192949468.0, "reward": 1.4683265686035156, "reward_std": 0.04387710988521576, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46832650899887085, "rewards/correct_reward_func/std": 0.1616368591785431, "step": 1484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 1544.8929443359375, "completions/mean_terminated_length": 1544.8929443359375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 2.3130841121495327, "grad_norm": 0.591140866279602, "kl": 0.053639987483620644, "learning_rate": 1.07875e-06, "loss": -0.0171, "num_tokens": 193085281.0, "reward": 1.5192233324050903, "reward_std": 0.06582663953304291, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5192232728004456, "rewards/correct_reward_func/std": 0.16659405827522278, "step": 1485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1475.21435546875, "completions/mean_terminated_length": 1475.21435546875, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 2.3146417445482865, "grad_norm": 0.5825228095054626, "kl": 0.05309503898024559, "learning_rate": 1.078125e-06, "loss": 0.0059, "num_tokens": 193215199.0, "reward": 1.5088545083999634, "reward_std": 0.06439197808504105, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.508854329586029, "rewards/correct_reward_func/std": 0.14637145400047302, "step": 1486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1458.1309814453125, "completions/mean_terminated_length": 1458.1309814453125, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 2.3161993769470404, "grad_norm": 0.5589291453361511, "kl": 0.05132404714822769, "learning_rate": 1.0774999999999998e-06, "loss": 0.0117, "num_tokens": 193343574.0, "reward": 1.5080413818359375, "reward_std": 0.05234989896416664, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.508041501045227, "rewards/correct_reward_func/std": 0.11428192257881165, "step": 1487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2321.0, "completions/mean_length": 1712.547607421875, "completions/mean_terminated_length": 1554.5120849609375, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "epoch": 2.317757009345794, "grad_norm": 0.5029699206352234, "kl": 0.047329457476735115, "learning_rate": 1.076875e-06, "loss": 0.1226, "num_tokens": 193493506.0, "reward": 1.5036362409591675, "reward_std": 0.11629515886306763, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5155408382415771, "rewards/correct_reward_func/std": 0.18734131753444672, "step": 1488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1525.2381591796875, "completions/mean_terminated_length": 1525.2381591796875, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 2.3193146417445485, "grad_norm": 0.5806748270988464, "kl": 0.05009318143129349, "learning_rate": 1.0762499999999999e-06, "loss": 0.0298, "num_tokens": 193627686.0, "reward": 1.5326610803604126, "reward_std": 0.05039670318365097, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5326610803604126, "rewards/correct_reward_func/std": 0.10832948982715607, "step": 1489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1529.25, "completions/mean_terminated_length": 1529.25, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "epoch": 2.3208722741433023, "grad_norm": 0.5581504106521606, "kl": 0.05093400366604328, "learning_rate": 1.075625e-06, "loss": -0.0222, "num_tokens": 193761981.0, "reward": 1.5125718116760254, "reward_std": 0.05067034810781479, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5125716924667358, "rewards/correct_reward_func/std": 0.12764203548431396, "step": 1490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 1463.5357666015625, "completions/mean_terminated_length": 1463.5357666015625, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 2.322429906542056, "grad_norm": 0.6038565635681152, "kl": 0.052472831681370735, "learning_rate": 1.0749999999999999e-06, "loss": 0.0027, "num_tokens": 193890816.0, "reward": 1.4892951250076294, "reward_std": 0.14514635503292084, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.525009274482727, "rewards/correct_reward_func/std": 0.16362476348876953, "step": 1491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6434.0, "completions/max_terminated_length": 6434.0, "completions/mean_length": 1558.4761962890625, "completions/mean_terminated_length": 1558.4761962890625, "completions/min_length": 550.0, "completions/min_terminated_length": 550.0, "epoch": 2.32398753894081, "grad_norm": 0.5639177560806274, "kl": 0.048492105677723885, "learning_rate": 1.074375e-06, "loss": -0.0125, "num_tokens": 194027746.0, "reward": 1.51744544506073, "reward_std": 0.06054462864995003, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5174453854560852, "rewards/correct_reward_func/std": 0.14355380833148956, "step": 1492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3135.0, "completions/max_terminated_length": 3135.0, "completions/mean_length": 1519.2261962890625, "completions/mean_terminated_length": 1519.2261962890625, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 2.325545171339564, "grad_norm": 0.5945416688919067, "kl": 0.050016388297080994, "learning_rate": 1.0737499999999999e-06, "loss": -0.0258, "num_tokens": 194161439.0, "reward": 1.530950665473938, "reward_std": 0.0840780958533287, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5428553819656372, "rewards/correct_reward_func/std": 0.17042502760887146, "step": 1493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2241.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 1507.4285888671875, "completions/mean_terminated_length": 1507.4285888671875, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 2.3271028037383177, "grad_norm": 0.5898760557174683, "kl": 0.05393840745091438, "learning_rate": 1.073125e-06, "loss": 0.0118, "num_tokens": 194294219.0, "reward": 1.5262326002120972, "reward_std": 0.04770844429731369, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5262325406074524, "rewards/correct_reward_func/std": 0.11736533790826797, "step": 1494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2590.0, "completions/max_terminated_length": 2590.0, "completions/mean_length": 1530.4285888671875, "completions/mean_terminated_length": 1530.4285888671875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.3286604361370715, "grad_norm": 0.5714182257652283, "kl": 0.04990994744002819, "learning_rate": 1.0725e-06, "loss": -0.0099, "num_tokens": 194428685.0, "reward": 1.480570912361145, "reward_std": 0.07531038671731949, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.492475688457489, "rewards/correct_reward_func/std": 0.14578549563884735, "step": 1495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2575.0, "completions/max_terminated_length": 2575.0, "completions/mean_length": 1493.90478515625, "completions/mean_terminated_length": 1493.90478515625, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 2.3302180685358254, "grad_norm": 0.5452219247817993, "kl": 0.04878560081124306, "learning_rate": 1.0718749999999998e-06, "loss": 0.0112, "num_tokens": 194560065.0, "reward": 1.511672019958496, "reward_std": 0.06941808015108109, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5116719603538513, "rewards/correct_reward_func/std": 0.1337762475013733, "step": 1496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 1510.547607421875, "completions/mean_terminated_length": 1510.547607421875, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 2.331775700934579, "grad_norm": 0.5733957290649414, "kl": 0.052027832716703415, "learning_rate": 1.07125e-06, "loss": -0.0101, "num_tokens": 194693005.0, "reward": 1.5439540147781372, "reward_std": 0.059034314006567, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5439540147781372, "rewards/correct_reward_func/std": 0.10800313949584961, "step": 1497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 1511.5595703125, "completions/mean_terminated_length": 1511.5595703125, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 2.3333333333333335, "grad_norm": 0.5542396306991577, "kl": 0.05321688391268253, "learning_rate": 1.0706249999999998e-06, "loss": 0.0164, "num_tokens": 194825790.0, "reward": 1.5276340246200562, "reward_std": 0.09240320324897766, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5395386815071106, "rewards/correct_reward_func/std": 0.14749787747859955, "step": 1498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 1527.4881591796875, "completions/mean_terminated_length": 1527.4881591796875, "completions/min_length": 1046.0, "completions/min_terminated_length": 1046.0, "epoch": 2.3348909657320873, "grad_norm": 0.5923004150390625, "kl": 0.052399272099137306, "learning_rate": 1.07e-06, "loss": -0.0045, "num_tokens": 194960339.0, "reward": 1.4710288047790527, "reward_std": 0.05762951076030731, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4710286855697632, "rewards/correct_reward_func/std": 0.1321936696767807, "step": 1499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2445.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 1551.9405517578125, "completions/mean_terminated_length": 1551.9405517578125, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 2.336448598130841, "grad_norm": 0.5823187232017517, "kl": 0.051180312409996986, "learning_rate": 1.0693749999999998e-06, "loss": 0.0034, "num_tokens": 195096456.0, "reward": 1.4296586513519287, "reward_std": 0.05320543423295021, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4296586513519287, "rewards/correct_reward_func/std": 0.15993890166282654, "step": 1500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 1533.107177734375, "completions/mean_terminated_length": 1533.107177734375, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 2.338006230529595, "grad_norm": 0.5223891735076904, "kl": 0.05084380693733692, "learning_rate": 1.0687500000000001e-06, "loss": -0.0302, "num_tokens": 195231249.0, "reward": 1.518973469734192, "reward_std": 0.0596567802131176, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5189732909202576, "rewards/correct_reward_func/std": 0.18209892511367798, "step": 1501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2538.0, "completions/max_terminated_length": 2538.0, "completions/mean_length": 1543.21435546875, "completions/mean_terminated_length": 1543.21435546875, "completions/min_length": 1093.0, "completions/min_terminated_length": 1093.0, "epoch": 2.339563862928349, "grad_norm": 0.6230779886245728, "kl": 0.05000448226928711, "learning_rate": 1.068125e-06, "loss": 0.0275, "num_tokens": 195366855.0, "reward": 1.4863332509994507, "reward_std": 0.07799090445041656, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49823787808418274, "rewards/correct_reward_func/std": 0.14738164842128754, "step": 1502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3517.0, "completions/max_terminated_length": 3517.0, "completions/mean_length": 1622.202392578125, "completions/mean_terminated_length": 1622.202392578125, "completions/min_length": 1007.0, "completions/min_terminated_length": 1007.0, "epoch": 2.3411214953271027, "grad_norm": 0.5411363244056702, "kl": 0.05251741595566273, "learning_rate": 1.0675e-06, "loss": 0.0163, "num_tokens": 195509096.0, "reward": 1.4867935180664062, "reward_std": 0.11418452858924866, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5106030702590942, "rewards/correct_reward_func/std": 0.12160339206457138, "step": 1503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 1477.261962890625, "completions/mean_terminated_length": 1477.261962890625, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 2.3426791277258565, "grad_norm": 0.6142602562904358, "kl": 0.05079326778650284, "learning_rate": 1.066875e-06, "loss": 0.0136, "num_tokens": 195639030.0, "reward": 1.5784060955047607, "reward_std": 0.06388647109270096, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5784059166908264, "rewards/correct_reward_func/std": 0.18396161496639252, "step": 1504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2386.0, "completions/max_terminated_length": 2386.0, "completions/mean_length": 1486.6429443359375, "completions/mean_terminated_length": 1486.6429443359375, "completions/min_length": 337.0, "completions/min_terminated_length": 337.0, "epoch": 2.344236760124611, "grad_norm": 0.5650523900985718, "kl": 0.0528233852237463, "learning_rate": 1.06625e-06, "loss": -0.0313, "num_tokens": 195769794.0, "reward": 1.5481196641921997, "reward_std": 0.06456782668828964, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5481196641921997, "rewards/correct_reward_func/std": 0.17777185142040253, "step": 1505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 1683.0238037109375, "completions/mean_terminated_length": 1604.602294921875, "completions/min_length": 984.0, "completions/min_terminated_length": 984.0, "epoch": 2.3457943925233646, "grad_norm": 0.5712416768074036, "kl": 0.05164938606321812, "learning_rate": 1.065625e-06, "loss": 0.041, "num_tokens": 195917072.0, "reward": 1.5184962749481201, "reward_std": 0.06124607101082802, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5184961557388306, "rewards/correct_reward_func/std": 0.16714932024478912, "step": 1506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2216.0, "completions/max_terminated_length": 2216.0, "completions/mean_length": 1563.2738037109375, "completions/mean_terminated_length": 1563.2738037109375, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "epoch": 2.3473520249221185, "grad_norm": 0.5845817923545837, "kl": 0.050174521282315254, "learning_rate": 1.065e-06, "loss": 0.0145, "num_tokens": 196054285.0, "reward": 1.477543830871582, "reward_std": 0.08831338584423065, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48944854736328125, "rewards/correct_reward_func/std": 0.17659763991832733, "step": 1507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 1579.357177734375, "completions/mean_terminated_length": 1579.357177734375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 2.3489096573208723, "grad_norm": 0.5994422435760498, "kl": 0.05463994853198528, "learning_rate": 1.064375e-06, "loss": -0.0055, "num_tokens": 196192921.0, "reward": 1.5244969129562378, "reward_std": 0.08881017565727234, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5364015102386475, "rewards/correct_reward_func/std": 0.18006746470928192, "step": 1508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1501.952392578125, "completions/mean_terminated_length": 1501.952392578125, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 2.350467289719626, "grad_norm": 0.5493417382240295, "kl": 0.051061000674963, "learning_rate": 1.06375e-06, "loss": -0.0104, "num_tokens": 196324929.0, "reward": 1.5005031824111938, "reward_std": 0.055389344692230225, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5005030632019043, "rewards/correct_reward_func/std": 0.15074452757835388, "step": 1509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 1484.15478515625, "completions/mean_terminated_length": 1484.15478515625, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 2.35202492211838, "grad_norm": 0.6482101082801819, "kl": 0.05101570300757885, "learning_rate": 1.063125e-06, "loss": 0.0134, "num_tokens": 196455322.0, "reward": 1.5651980638504028, "reward_std": 0.06702639907598495, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5651980638504028, "rewards/correct_reward_func/std": 0.1886829435825348, "step": 1510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2701.0, "completions/max_terminated_length": 2701.0, "completions/mean_length": 1612.34521484375, "completions/mean_terminated_length": 1612.34521484375, "completions/min_length": 1168.0, "completions/min_terminated_length": 1168.0, "epoch": 2.353582554517134, "grad_norm": 0.5800288319587708, "kl": 0.05016557313501835, "learning_rate": 1.0625e-06, "loss": 0.0174, "num_tokens": 196596873.0, "reward": 1.5006563663482666, "reward_std": 0.11092998832464218, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5244658589363098, "rewards/correct_reward_func/std": 0.16270431876182556, "step": 1511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 1486.5595703125, "completions/mean_terminated_length": 1486.5595703125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 2.3551401869158877, "grad_norm": 0.5720930695533752, "kl": 0.051380522549152374, "learning_rate": 1.0618749999999999e-06, "loss": 0.0001, "num_tokens": 196727642.0, "reward": 1.4891971349716187, "reward_std": 0.10666787624359131, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5130066871643066, "rewards/correct_reward_func/std": 0.15706582367420197, "step": 1512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2424.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 1609.0357666015625, "completions/mean_terminated_length": 1609.0357666015625, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 2.3566978193146415, "grad_norm": 0.5553697347640991, "kl": 0.05065441131591797, "learning_rate": 1.06125e-06, "loss": 0.0013, "num_tokens": 196868837.0, "reward": 1.5190095901489258, "reward_std": 0.06515847891569138, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5190094709396362, "rewards/correct_reward_func/std": 0.14390747249126434, "step": 1513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 1603.547607421875, "completions/mean_terminated_length": 1603.547607421875, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 2.358255451713396, "grad_norm": 0.5658085346221924, "kl": 0.053572630509734154, "learning_rate": 1.060625e-06, "loss": -0.0091, "num_tokens": 197009469.0, "reward": 1.517641544342041, "reward_std": 0.06398769468069077, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5176414847373962, "rewards/correct_reward_func/std": 0.15851260721683502, "step": 1514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 1612.09521484375, "completions/mean_terminated_length": 1612.09521484375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 2.3598130841121496, "grad_norm": 0.5591447949409485, "kl": 0.05193502642214298, "learning_rate": 1.06e-06, "loss": 0.0272, "num_tokens": 197150951.0, "reward": 1.4987539052963257, "reward_std": 0.07747209072113037, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5106586217880249, "rewards/correct_reward_func/std": 0.14374776184558868, "step": 1515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1536.2261962890625, "completions/mean_terminated_length": 1536.2261962890625, "completions/min_length": 1140.0, "completions/min_terminated_length": 1140.0, "epoch": 2.3613707165109035, "grad_norm": 0.6274176836013794, "kl": 0.051331739872694016, "learning_rate": 1.059375e-06, "loss": 0.0176, "num_tokens": 197285958.0, "reward": 1.564021348953247, "reward_std": 0.05985343083739281, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5640212297439575, "rewards/correct_reward_func/std": 0.17172639071941376, "step": 1516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2440.0, "completions/max_terminated_length": 2440.0, "completions/mean_length": 1642.3809814453125, "completions/mean_terminated_length": 1642.3809814453125, "completions/min_length": 1089.0, "completions/min_terminated_length": 1089.0, "epoch": 2.3629283489096573, "grad_norm": 0.5952745676040649, "kl": 0.05227385088801384, "learning_rate": 1.05875e-06, "loss": -0.0185, "num_tokens": 197430128.0, "reward": 1.5174399614334106, "reward_std": 0.05514545738697052, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5174399614334106, "rewards/correct_reward_func/std": 0.2031947672367096, "step": 1517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2488.0, "completions/max_terminated_length": 2488.0, "completions/mean_length": 1545.09521484375, "completions/mean_terminated_length": 1545.09521484375, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 2.364485981308411, "grad_norm": 0.5363126993179321, "kl": 0.05197865702211857, "learning_rate": 1.058125e-06, "loss": -0.0116, "num_tokens": 197565778.0, "reward": 1.4422682523727417, "reward_std": 0.05776557698845863, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4422682225704193, "rewards/correct_reward_func/std": 0.15054292976856232, "step": 1518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2608.0, "completions/max_terminated_length": 2608.0, "completions/mean_length": 1569.9405517578125, "completions/mean_terminated_length": 1569.9405517578125, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.366043613707165, "grad_norm": 0.5733745098114014, "kl": 0.051820605993270874, "learning_rate": 1.0575e-06, "loss": -0.0026, "num_tokens": 197703713.0, "reward": 1.5751185417175293, "reward_std": 0.05798014625906944, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5751185417175293, "rewards/correct_reward_func/std": 0.19570593535900116, "step": 1519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 1668.3929443359375, "completions/mean_terminated_length": 1589.795166015625, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 2.367601246105919, "grad_norm": 0.5476160645484924, "kl": 0.04925505816936493, "learning_rate": 1.056875e-06, "loss": 0.0595, "num_tokens": 197849810.0, "reward": 1.4853663444519043, "reward_std": 0.11030389368534088, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49727103114128113, "rewards/correct_reward_func/std": 0.11946482956409454, "step": 1520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2315.0, "completions/max_terminated_length": 2315.0, "completions/mean_length": 1542.2261962890625, "completions/mean_terminated_length": 1542.2261962890625, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 2.369158878504673, "grad_norm": 0.5898823738098145, "kl": 0.0527112428098917, "learning_rate": 1.0562499999999998e-06, "loss": -0.019, "num_tokens": 197985387.0, "reward": 1.4679503440856934, "reward_std": 0.06638707965612411, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46795040369033813, "rewards/correct_reward_func/std": 0.14619490504264832, "step": 1521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2898.0, "completions/max_terminated_length": 2898.0, "completions/mean_length": 1576.3809814453125, "completions/mean_terminated_length": 1576.3809814453125, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 2.370716510903427, "grad_norm": 0.5711277723312378, "kl": 0.052864741533994675, "learning_rate": 1.055625e-06, "loss": 0.0072, "num_tokens": 198123857.0, "reward": 1.5095322132110596, "reward_std": 0.048503756523132324, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5095321536064148, "rewards/correct_reward_func/std": 0.13190624117851257, "step": 1522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 1592.9285888671875, "completions/mean_terminated_length": 1592.9285888671875, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.372274143302181, "grad_norm": 0.5910865664482117, "kl": 0.0509920921176672, "learning_rate": 1.0549999999999999e-06, "loss": 0.0033, "num_tokens": 198263651.0, "reward": 1.5064606666564941, "reward_std": 0.08337225019931793, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5183655023574829, "rewards/correct_reward_func/std": 0.1273634284734726, "step": 1523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 1523.6190185546875, "completions/mean_terminated_length": 1523.6190185546875, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "epoch": 2.3738317757009346, "grad_norm": 0.6241156458854675, "kl": 0.05443510040640831, "learning_rate": 1.054375e-06, "loss": 0.0096, "num_tokens": 198397587.0, "reward": 1.483352541923523, "reward_std": 0.05465046688914299, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4833524823188782, "rewards/correct_reward_func/std": 0.1490633338689804, "step": 1524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1701.857177734375, "completions/mean_terminated_length": 1623.66259765625, "completions/min_length": 941.0, "completions/min_terminated_length": 941.0, "epoch": 2.3753894080996885, "grad_norm": 0.5477166771888733, "kl": 0.050334708765149117, "learning_rate": 1.0537499999999999e-06, "loss": 0.0323, "num_tokens": 198546627.0, "reward": 1.5251368284225464, "reward_std": 0.07382959872484207, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5251369476318359, "rewards/correct_reward_func/std": 0.1658909022808075, "step": 1525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2948.0, "completions/max_terminated_length": 2948.0, "completions/mean_length": 1503.666748046875, "completions/mean_terminated_length": 1503.666748046875, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 2.3769470404984423, "grad_norm": 0.5741845369338989, "kl": 0.05395548790693283, "learning_rate": 1.053125e-06, "loss": 0.0041, "num_tokens": 198678863.0, "reward": 1.4678336381912231, "reward_std": 0.059803735464811325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46783357858657837, "rewards/correct_reward_func/std": 0.17054402828216553, "step": 1526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2567.0, "completions/max_terminated_length": 2567.0, "completions/mean_length": 1544.2857666015625, "completions/mean_terminated_length": 1544.2857666015625, "completions/min_length": 649.0, "completions/min_terminated_length": 649.0, "epoch": 2.378504672897196, "grad_norm": 0.5990108251571655, "kl": 0.05085635930299759, "learning_rate": 1.0524999999999999e-06, "loss": 0.0191, "num_tokens": 198814649.0, "reward": 1.5996273756027222, "reward_std": 0.08130278438329697, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5996273159980774, "rewards/correct_reward_func/std": 0.14758199453353882, "step": 1527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1471.952392578125, "completions/mean_terminated_length": 1471.952392578125, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "epoch": 2.38006230529595, "grad_norm": 0.5695307850837708, "kl": 0.052955834195017815, "learning_rate": 1.0518749999999998e-06, "loss": 0.0163, "num_tokens": 198944185.0, "reward": 1.5021026134490967, "reward_std": 0.05220888927578926, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5021026730537415, "rewards/correct_reward_func/std": 0.13775546848773956, "step": 1528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1509.916748046875, "completions/mean_terminated_length": 1509.916748046875, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "epoch": 2.381619937694704, "grad_norm": 0.6331456303596497, "kl": 0.05161930434405804, "learning_rate": 1.0512499999999999e-06, "loss": -0.0255, "num_tokens": 199076904.0, "reward": 1.4675413370132446, "reward_std": 0.06481567770242691, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4675411880016327, "rewards/correct_reward_func/std": 0.1194775179028511, "step": 1529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4567.0, "completions/max_terminated_length": 4567.0, "completions/mean_length": 1542.2857666015625, "completions/mean_terminated_length": 1542.2857666015625, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 2.383177570093458, "grad_norm": 0.5621152520179749, "kl": 0.050272012129426, "learning_rate": 1.0506249999999998e-06, "loss": 0.0262, "num_tokens": 199212372.0, "reward": 1.5185221433639526, "reward_std": 0.11595337092876434, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5423315763473511, "rewards/correct_reward_func/std": 0.18327778577804565, "step": 1530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2067.0, "completions/max_terminated_length": 2067.0, "completions/mean_length": 1482.4405517578125, "completions/mean_terminated_length": 1482.4405517578125, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 2.384735202492212, "grad_norm": 0.5831451416015625, "kl": 0.051480814814567566, "learning_rate": 1.05e-06, "loss": 0.0151, "num_tokens": 199342759.0, "reward": 1.512341856956482, "reward_std": 0.08326917141675949, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5242465138435364, "rewards/correct_reward_func/std": 0.12070773541927338, "step": 1531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1469.84521484375, "completions/mean_terminated_length": 1469.84521484375, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 2.3862928348909658, "grad_norm": 0.6350201964378357, "kl": 0.056153370067477226, "learning_rate": 1.0493749999999998e-06, "loss": 0.0172, "num_tokens": 199472154.0, "reward": 1.465613842010498, "reward_std": 0.04932459443807602, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46561378240585327, "rewards/correct_reward_func/std": 0.15814805030822754, "step": 1532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2583.0, "completions/max_terminated_length": 2583.0, "completions/mean_length": 1523.1429443359375, "completions/mean_terminated_length": 1523.1429443359375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 2.3878504672897196, "grad_norm": 0.5866072773933411, "kl": 0.05121762678027153, "learning_rate": 1.0487500000000001e-06, "loss": 0.0136, "num_tokens": 199606218.0, "reward": 1.485684871673584, "reward_std": 0.06622068583965302, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48568490147590637, "rewards/correct_reward_func/std": 0.09405961632728577, "step": 1533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2305.0, "completions/max_terminated_length": 2305.0, "completions/mean_length": 1432.1785888671875, "completions/mean_terminated_length": 1432.1785888671875, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 2.3894080996884735, "grad_norm": 0.5844257473945618, "kl": 0.051794178783893585, "learning_rate": 1.048125e-06, "loss": 0.0077, "num_tokens": 199732365.0, "reward": 1.5079231262207031, "reward_std": 0.07538687437772751, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5079232454299927, "rewards/correct_reward_func/std": 0.1680031269788742, "step": 1534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2309.0, "completions/max_terminated_length": 2309.0, "completions/mean_length": 1457.9285888671875, "completions/mean_terminated_length": 1457.9285888671875, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 2.3909657320872273, "grad_norm": 0.5815176963806152, "kl": 0.052396222949028015, "learning_rate": 1.0475000000000001e-06, "loss": 0.0017, "num_tokens": 199860807.0, "reward": 1.4885550737380981, "reward_std": 0.0876530185341835, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5004598498344421, "rewards/correct_reward_func/std": 0.18733735382556915, "step": 1535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1586.0833740234375, "completions/mean_terminated_length": 1506.493896484375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 2.392523364485981, "grad_norm": 0.5464408993721008, "kl": 0.050415677949786186, "learning_rate": 1.046875e-06, "loss": 0.0775, "num_tokens": 199999900.0, "reward": 1.4728608131408691, "reward_std": 0.048757996410131454, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4728606939315796, "rewards/correct_reward_func/std": 0.14347362518310547, "step": 1536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1488.1429443359375, "completions/mean_terminated_length": 1488.1429443359375, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "epoch": 2.3940809968847354, "grad_norm": 0.6232819557189941, "kl": 0.05137298069894314, "learning_rate": 1.04625e-06, "loss": -0.0196, "num_tokens": 200130826.0, "reward": 1.4844341278076172, "reward_std": 0.10366541147232056, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5082435607910156, "rewards/correct_reward_func/std": 0.1660507172346115, "step": 1537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 1576.0833740234375, "completions/mean_terminated_length": 1576.0833740234375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 2.3956386292834893, "grad_norm": 0.5646374821662903, "kl": 0.052293770015239716, "learning_rate": 1.045625e-06, "loss": 0.0026, "num_tokens": 200269349.0, "reward": 1.5848065614700317, "reward_std": 0.0654391273856163, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5848065614700317, "rewards/correct_reward_func/std": 0.15569093823432922, "step": 1538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2253.0, "completions/max_terminated_length": 2253.0, "completions/mean_length": 1484.8095703125, "completions/mean_terminated_length": 1484.8095703125, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 2.397196261682243, "grad_norm": 0.6020044088363647, "kl": 0.05074855871498585, "learning_rate": 1.045e-06, "loss": 0.0083, "num_tokens": 200400073.0, "reward": 1.5574504137039185, "reward_std": 0.06154116243124008, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5574504137039185, "rewards/correct_reward_func/std": 0.18654121458530426, "step": 1539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2536.0, "completions/max_terminated_length": 2536.0, "completions/mean_length": 1530.2261962890625, "completions/mean_terminated_length": 1530.2261962890625, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 2.398753894080997, "grad_norm": 0.5945984721183777, "kl": 0.0508001409471035, "learning_rate": 1.044375e-06, "loss": 0.0229, "num_tokens": 200534612.0, "reward": 1.5070151090621948, "reward_std": 0.07162400335073471, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5070151090621948, "rewards/correct_reward_func/std": 0.16795474290847778, "step": 1540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3773.0, "completions/max_terminated_length": 3773.0, "completions/mean_length": 1566.7857666015625, "completions/mean_terminated_length": 1566.7857666015625, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 2.4003115264797508, "grad_norm": 0.5954582095146179, "kl": 0.04980020970106125, "learning_rate": 1.04375e-06, "loss": -0.0083, "num_tokens": 200672036.0, "reward": 1.5030343532562256, "reward_std": 0.07617799937725067, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5030342936515808, "rewards/correct_reward_func/std": 0.15437807142734528, "step": 1541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2200.0, "completions/max_terminated_length": 2200.0, "completions/mean_length": 1471.65478515625, "completions/mean_terminated_length": 1471.65478515625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 2.4018691588785046, "grad_norm": 0.6304090023040771, "kl": 0.05186704732477665, "learning_rate": 1.043125e-06, "loss": 0.0389, "num_tokens": 200801643.0, "reward": 1.541716456413269, "reward_std": 0.05233090743422508, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5417162775993347, "rewards/correct_reward_func/std": 0.19871924817562103, "step": 1542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1556.59521484375, "completions/mean_terminated_length": 1476.6505126953125, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 2.4034267912772584, "grad_norm": 0.5754531621932983, "kl": 0.04689878597855568, "learning_rate": 1.0425e-06, "loss": 0.0811, "num_tokens": 200938301.0, "reward": 1.5520706176757812, "reward_std": 0.08832674473524094, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5520706176757812, "rewards/correct_reward_func/std": 0.1472245454788208, "step": 1543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 1593.0595703125, "completions/mean_terminated_length": 1513.55419921875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 2.4049844236760123, "grad_norm": 0.570315420627594, "kl": 0.052859121933579445, "learning_rate": 1.041875e-06, "loss": 0.0773, "num_tokens": 201078358.0, "reward": 1.5072999000549316, "reward_std": 0.06726644933223724, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5072997212409973, "rewards/correct_reward_func/std": 0.1763041764497757, "step": 1544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2256.0, "completions/max_terminated_length": 2256.0, "completions/mean_length": 1455.5, "completions/mean_terminated_length": 1455.5, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 2.406542056074766, "grad_norm": 0.5854687690734863, "kl": 0.0522833988070488, "learning_rate": 1.04125e-06, "loss": -0.0216, "num_tokens": 201206452.0, "reward": 1.5468658208847046, "reward_std": 0.10119383037090302, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.558770477771759, "rewards/correct_reward_func/std": 0.16341501474380493, "step": 1545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1449.15478515625, "completions/mean_terminated_length": 1449.15478515625, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 2.4080996884735204, "grad_norm": 0.6188949942588806, "kl": 0.05467011593282223, "learning_rate": 1.0406249999999999e-06, "loss": 0.0204, "num_tokens": 201334121.0, "reward": 1.499940037727356, "reward_std": 0.04986538365483284, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49993985891342163, "rewards/correct_reward_func/std": 0.17386476695537567, "step": 1546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 1586.511962890625, "completions/mean_terminated_length": 1506.9276123046875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 2.4096573208722742, "grad_norm": 0.5685256123542786, "kl": 0.047964656725525856, "learning_rate": 1.04e-06, "loss": 0.0586, "num_tokens": 201473364.0, "reward": 1.4286352396011353, "reward_std": 0.09823426604270935, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4405398964881897, "rewards/correct_reward_func/std": 0.16164733469486237, "step": 1547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1509.8929443359375, "completions/mean_terminated_length": 1509.8929443359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 2.411214953271028, "grad_norm": 0.6040443181991577, "kl": 0.04977122135460377, "learning_rate": 1.039375e-06, "loss": 0.0016, "num_tokens": 201606261.0, "reward": 1.538719892501831, "reward_std": 0.09641184657812119, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.562529444694519, "rewards/correct_reward_func/std": 0.1265079379081726, "step": 1548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2199.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 1400.7857666015625, "completions/mean_terminated_length": 1400.7857666015625, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 2.412772585669782, "grad_norm": 0.6082838773727417, "kl": 0.05091831833124161, "learning_rate": 1.03875e-06, "loss": 0.019, "num_tokens": 201729843.0, "reward": 1.4924229383468628, "reward_std": 0.07258455455303192, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4924229085445404, "rewards/correct_reward_func/std": 0.19198742508888245, "step": 1549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2348.0, "completions/max_terminated_length": 2348.0, "completions/mean_length": 1457.6429443359375, "completions/mean_terminated_length": 1457.6429443359375, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 2.4143302180685358, "grad_norm": 0.5725143551826477, "kl": 0.05122668109834194, "learning_rate": 1.038125e-06, "loss": -0.0211, "num_tokens": 201858255.0, "reward": 1.4791282415390015, "reward_std": 0.08577775210142136, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4910329282283783, "rewards/correct_reward_func/std": 0.1526186764240265, "step": 1550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2695.0, "completions/max_terminated_length": 2695.0, "completions/mean_length": 1490.96435546875, "completions/mean_terminated_length": 1490.96435546875, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 2.4158878504672896, "grad_norm": 0.6268028616905212, "kl": 0.05152355693280697, "learning_rate": 1.0375e-06, "loss": 0.0186, "num_tokens": 201989652.0, "reward": 1.5790352821350098, "reward_std": 0.09141747653484344, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5790351629257202, "rewards/correct_reward_func/std": 0.1544439196586609, "step": 1551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1380.8809814453125, "completions/mean_terminated_length": 1380.8809814453125, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 2.4174454828660434, "grad_norm": 0.5758988857269287, "kl": 0.0513425562530756, "learning_rate": 1.036875e-06, "loss": 0.006, "num_tokens": 202111694.0, "reward": 1.534291386604309, "reward_std": 0.0753999873995781, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5342913866043091, "rewards/correct_reward_func/std": 0.15418009459972382, "step": 1552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 1446.1785888671875, "completions/mean_terminated_length": 1446.1785888671875, "completions/min_length": 764.0, "completions/min_terminated_length": 764.0, "epoch": 2.4190031152647977, "grad_norm": 0.5925214886665344, "kl": 0.05196128599345684, "learning_rate": 1.0362499999999998e-06, "loss": 0.0041, "num_tokens": 202239113.0, "reward": 1.471336007118225, "reward_std": 0.08267415314912796, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4832407534122467, "rewards/correct_reward_func/std": 0.13113440573215485, "step": 1553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2103.0, "completions/max_terminated_length": 2103.0, "completions/mean_length": 1402.0595703125, "completions/mean_terminated_length": 1402.0595703125, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 2.4205607476635516, "grad_norm": 0.6258537769317627, "kl": 0.049265896901488304, "learning_rate": 1.035625e-06, "loss": 0.0225, "num_tokens": 202362880.0, "reward": 1.5272910594940186, "reward_std": 0.05813249200582504, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5272909998893738, "rewards/correct_reward_func/std": 0.1525162011384964, "step": 1554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1376.0595703125, "completions/mean_terminated_length": 1376.0595703125, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 2.4221183800623054, "grad_norm": 0.6103487014770508, "kl": 0.051622893661260605, "learning_rate": 1.0349999999999998e-06, "loss": -0.0004, "num_tokens": 202484451.0, "reward": 1.470107913017273, "reward_std": 0.07860714942216873, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48201268911361694, "rewards/correct_reward_func/std": 0.12940949201583862, "step": 1555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1414.607177734375, "completions/mean_terminated_length": 1414.607177734375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 2.4236760124610592, "grad_norm": 0.5935145616531372, "kl": 0.049898386001586914, "learning_rate": 1.034375e-06, "loss": -0.0268, "num_tokens": 202609074.0, "reward": 1.4826003313064575, "reward_std": 0.05725365877151489, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48260027170181274, "rewards/correct_reward_func/std": 0.13328814506530762, "step": 1556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1450.3690185546875, "completions/mean_terminated_length": 1450.3690185546875, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 2.425233644859813, "grad_norm": 0.6070877909660339, "kl": 0.050626158714294434, "learning_rate": 1.0337499999999998e-06, "loss": -0.0194, "num_tokens": 202736959.0, "reward": 1.5334666967391968, "reward_std": 0.0529770627617836, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.533466637134552, "rewards/correct_reward_func/std": 0.17534418404102325, "step": 1557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2293.0, "completions/max_terminated_length": 2293.0, "completions/mean_length": 1405.6190185546875, "completions/mean_terminated_length": 1405.6190185546875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 2.426791277258567, "grad_norm": 0.6116954684257507, "kl": 0.05048785358667374, "learning_rate": 1.033125e-06, "loss": 0.0037, "num_tokens": 202860947.0, "reward": 1.4971402883529663, "reward_std": 0.05506269261240959, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4971402585506439, "rewards/correct_reward_func/std": 0.18587855994701385, "step": 1558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1467.5357666015625, "completions/mean_terminated_length": 1386.51806640625, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 2.4283489096573208, "grad_norm": 0.6054674983024597, "kl": 0.04978436976671219, "learning_rate": 1.0324999999999999e-06, "loss": 0.0538, "num_tokens": 202990226.0, "reward": 1.5439491271972656, "reward_std": 0.06428221613168716, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5439491271972656, "rewards/correct_reward_func/std": 0.21037735044956207, "step": 1559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1466.5833740234375, "completions/mean_terminated_length": 1385.55419921875, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "epoch": 2.4299065420560746, "grad_norm": 0.5907480716705322, "kl": 0.048747118562459946, "learning_rate": 1.031875e-06, "loss": 0.0628, "num_tokens": 203119251.0, "reward": 1.4438563585281372, "reward_std": 0.06560084968805313, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4438563883304596, "rewards/correct_reward_func/std": 0.1515107899904251, "step": 1560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2389.0, "completions/max_terminated_length": 2389.0, "completions/mean_length": 1425.452392578125, "completions/mean_terminated_length": 1425.452392578125, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 2.4314641744548284, "grad_norm": 0.597564697265625, "kl": 0.05119101516902447, "learning_rate": 1.0312499999999999e-06, "loss": 0.0067, "num_tokens": 203244899.0, "reward": 1.5148341655731201, "reward_std": 0.046356331557035446, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5148341059684753, "rewards/correct_reward_func/std": 0.11878227442502975, "step": 1561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1409.96435546875, "completions/mean_terminated_length": 1409.96435546875, "completions/min_length": 489.0, "completions/min_terminated_length": 489.0, "epoch": 2.4330218068535827, "grad_norm": 1.0873229503631592, "kl": 0.07883465476334095, "learning_rate": 1.0306249999999998e-06, "loss": 0.0119, "num_tokens": 203369402.0, "reward": 1.514100193977356, "reward_std": 0.062455326318740845, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5141000747680664, "rewards/correct_reward_func/std": 0.1331133395433426, "step": 1562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1463.5, "completions/mean_terminated_length": 1463.5, "completions/min_length": 636.0, "completions/min_terminated_length": 636.0, "epoch": 2.4345794392523366, "grad_norm": 0.609639048576355, "kl": 0.05311744287610054, "learning_rate": 1.0299999999999999e-06, "loss": -0.0084, "num_tokens": 203498534.0, "reward": 1.4591064453125, "reward_std": 0.05752236768603325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45910635590553284, "rewards/correct_reward_func/std": 0.10238172113895416, "step": 1563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1384.34521484375, "completions/mean_terminated_length": 1384.34521484375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 2.4361370716510904, "grad_norm": 0.6002703309059143, "kl": 0.05448311008512974, "learning_rate": 1.0293749999999998e-06, "loss": -0.0268, "num_tokens": 203620753.0, "reward": 1.4818909168243408, "reward_std": 0.07148406654596329, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48189082741737366, "rewards/correct_reward_func/std": 0.15402851998806, "step": 1564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3925.0, "completions/max_terminated_length": 3925.0, "completions/mean_length": 1530.46435546875, "completions/mean_terminated_length": 1530.46435546875, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 2.4376947040498442, "grad_norm": 0.5714273452758789, "kl": 0.05160851776599884, "learning_rate": 1.02875e-06, "loss": 0.0042, "num_tokens": 203755558.0, "reward": 1.39243745803833, "reward_std": 0.06817106902599335, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4043421149253845, "rewards/correct_reward_func/std": 0.11170884966850281, "step": 1565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 1570.857177734375, "completions/mean_terminated_length": 1409.3658447265625, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 2.439252336448598, "grad_norm": 0.5355578660964966, "kl": 0.04513352923095226, "learning_rate": 1.028125e-06, "loss": 0.1243, "num_tokens": 203893438.0, "reward": 1.4712097644805908, "reward_std": 0.06838100403547287, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47120967507362366, "rewards/correct_reward_func/std": 0.1638861447572708, "step": 1566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2288.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 1474.761962890625, "completions/mean_terminated_length": 1474.761962890625, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "epoch": 2.440809968847352, "grad_norm": 0.5861567854881287, "kl": 0.04810020141303539, "learning_rate": 1.0275000000000001e-06, "loss": -0.0162, "num_tokens": 204023456.0, "reward": 1.494341254234314, "reward_std": 0.065720334649086, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49434104561805725, "rewards/correct_reward_func/std": 0.1442166566848755, "step": 1567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1978.0, "completions/max_terminated_length": 1978.0, "completions/mean_length": 1410.6429443359375, "completions/mean_terminated_length": 1410.6429443359375, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 2.4423676012461057, "grad_norm": 0.6630206108093262, "kl": 0.05040242150425911, "learning_rate": 1.026875e-06, "loss": 0.0167, "num_tokens": 204147836.0, "reward": 1.492060899734497, "reward_std": 0.08362628519535065, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5039655566215515, "rewards/correct_reward_func/std": 0.15767835080623627, "step": 1568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1421.0238037109375, "completions/mean_terminated_length": 1421.0238037109375, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 2.44392523364486, "grad_norm": 0.6089303493499756, "kl": 0.05176975578069687, "learning_rate": 1.0262500000000001e-06, "loss": 0.0037, "num_tokens": 204273040.0, "reward": 1.4422324895858765, "reward_std": 0.05656725540757179, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4422324001789093, "rewards/correct_reward_func/std": 0.22006554901599884, "step": 1569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 1500.857177734375, "completions/mean_terminated_length": 1420.240966796875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 2.445482866043614, "grad_norm": 0.5794208645820618, "kl": 0.04869142547249794, "learning_rate": 1.025625e-06, "loss": 0.0764, "num_tokens": 204405022.0, "reward": 1.453341007232666, "reward_std": 0.08583046495914459, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4652457535266876, "rewards/correct_reward_func/std": 0.13295918703079224, "step": 1570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2962.0, "completions/max_terminated_length": 2962.0, "completions/mean_length": 1431.6190185546875, "completions/mean_terminated_length": 1431.6190185546875, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 2.4470404984423677, "grad_norm": 0.5875825881958008, "kl": 0.050680430606007576, "learning_rate": 1.025e-06, "loss": 0.0146, "num_tokens": 204531152.0, "reward": 1.585283637046814, "reward_std": 0.07232575118541718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5852835774421692, "rewards/correct_reward_func/std": 0.1866348385810852, "step": 1571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2123.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 1447.0714111328125, "completions/mean_terminated_length": 1447.0714111328125, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 2.4485981308411215, "grad_norm": 0.6189795732498169, "kl": 0.05242401361465454, "learning_rate": 1.024375e-06, "loss": 0.0054, "num_tokens": 204658730.0, "reward": 1.512374758720398, "reward_std": 0.07310151308774948, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.512374758720398, "rewards/correct_reward_func/std": 0.15215250849723816, "step": 1572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1389.547607421875, "completions/mean_terminated_length": 1389.547607421875, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 2.4501557632398754, "grad_norm": 0.6245627999305725, "kl": 0.054223209619522095, "learning_rate": 1.02375e-06, "loss": -0.0028, "num_tokens": 204781218.0, "reward": 1.4905513525009155, "reward_std": 0.12359943985939026, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5143609046936035, "rewards/correct_reward_func/std": 0.17044025659561157, "step": 1573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1472.511962890625, "completions/mean_terminated_length": 1472.511962890625, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 2.4517133956386292, "grad_norm": 0.5789850354194641, "kl": 0.04985970817506313, "learning_rate": 1.023125e-06, "loss": 0.0007, "num_tokens": 204910987.0, "reward": 1.4386543035507202, "reward_std": 0.04152819886803627, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4386541247367859, "rewards/correct_reward_func/std": 0.13864022493362427, "step": 1574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1462.4881591796875, "completions/mean_terminated_length": 1462.4881591796875, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 2.453271028037383, "grad_norm": 0.6081233024597168, "kl": 0.053139520809054375, "learning_rate": 1.0225e-06, "loss": -0.0189, "num_tokens": 205039884.0, "reward": 1.5262843370437622, "reward_std": 0.10526086390018463, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5381892323493958, "rewards/correct_reward_func/std": 0.1840529888868332, "step": 1575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2453.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 1473.09521484375, "completions/mean_terminated_length": 1473.09521484375, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "epoch": 2.454828660436137, "grad_norm": 0.6323999762535095, "kl": 0.05185391753911972, "learning_rate": 1.021875e-06, "loss": 0.0005, "num_tokens": 205169660.0, "reward": 1.5382132530212402, "reward_std": 0.058200761675834656, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5382132530212402, "rewards/correct_reward_func/std": 0.14009076356887817, "step": 1576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 1472.34521484375, "completions/mean_terminated_length": 1472.34521484375, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 2.4563862928348907, "grad_norm": 0.6209440231323242, "kl": 0.05240609683096409, "learning_rate": 1.02125e-06, "loss": 0.018, "num_tokens": 205299247.0, "reward": 1.4798184633255005, "reward_std": 0.05038895085453987, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47981828451156616, "rewards/correct_reward_func/std": 0.10969238728284836, "step": 1577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1482.25, "completions/mean_terminated_length": 1482.25, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 2.457943925233645, "grad_norm": 0.6004282832145691, "kl": 0.052911147475242615, "learning_rate": 1.0206249999999999e-06, "loss": 0.0009, "num_tokens": 205429762.0, "reward": 1.4458261728286743, "reward_std": 0.06990920007228851, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4458260238170624, "rewards/correct_reward_func/std": 0.10448075830936432, "step": 1578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2208.0, "completions/max_terminated_length": 2208.0, "completions/mean_length": 1445.0833740234375, "completions/mean_terminated_length": 1445.0833740234375, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 2.459501557632399, "grad_norm": 0.601300835609436, "kl": 0.051306189969182014, "learning_rate": 1.02e-06, "loss": 0.0173, "num_tokens": 205557125.0, "reward": 1.5340887308120728, "reward_std": 0.07774235308170319, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5340886116027832, "rewards/correct_reward_func/std": 0.17084696888923645, "step": 1579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2431.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 1480.0357666015625, "completions/mean_terminated_length": 1480.0357666015625, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 2.4610591900311527, "grad_norm": 0.5862287878990173, "kl": 0.051379214972257614, "learning_rate": 1.0193749999999999e-06, "loss": -0.0023, "num_tokens": 205687388.0, "reward": 1.449748158454895, "reward_std": 0.07117250561714172, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4497482180595398, "rewards/correct_reward_func/std": 0.1502813845872879, "step": 1580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1542.8333740234375, "completions/mean_terminated_length": 1542.8333740234375, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 2.4626168224299065, "grad_norm": 0.612071692943573, "kl": 0.049859074875712395, "learning_rate": 1.01875e-06, "loss": 0.0069, "num_tokens": 205823064.0, "reward": 1.5895568132400513, "reward_std": 0.060309309512376785, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5895566940307617, "rewards/correct_reward_func/std": 0.17404243350028992, "step": 1581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2384.0, "completions/max_terminated_length": 2384.0, "completions/mean_length": 1478.797607421875, "completions/mean_terminated_length": 1478.797607421875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.4641744548286604, "grad_norm": 0.5573348999023438, "kl": 0.05294640548527241, "learning_rate": 1.0181249999999999e-06, "loss": -0.0366, "num_tokens": 205953091.0, "reward": 1.4361010789871216, "reward_std": 0.047801658511161804, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43610095977783203, "rewards/correct_reward_func/std": 0.1530381739139557, "step": 1582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1467.7738037109375, "completions/mean_terminated_length": 1467.7738037109375, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 2.465732087227414, "grad_norm": 0.6175753474235535, "kl": 0.05216800235211849, "learning_rate": 1.0175e-06, "loss": 0.0194, "num_tokens": 206082408.0, "reward": 1.4596354961395264, "reward_std": 0.059151798486709595, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45963531732559204, "rewards/correct_reward_func/std": 0.10623925924301147, "step": 1583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2576.0, "completions/max_terminated_length": 2576.0, "completions/mean_length": 1525.2261962890625, "completions/mean_terminated_length": 1525.2261962890625, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 2.467289719626168, "grad_norm": 0.6107982993125916, "kl": 0.05199519172310829, "learning_rate": 1.016875e-06, "loss": -0.0195, "num_tokens": 206216317.0, "reward": 1.492191195487976, "reward_std": 0.06253939867019653, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49219104647636414, "rewards/correct_reward_func/std": 0.119044728577137, "step": 1584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1507.666748046875, "completions/mean_terminated_length": 1507.666748046875, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 2.4688473520249223, "grad_norm": 0.5693157315254211, "kl": 0.04964713379740715, "learning_rate": 1.01625e-06, "loss": 0.0014, "num_tokens": 206348931.0, "reward": 1.4573475122451782, "reward_std": 0.12318005412817001, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4811570346355438, "rewards/correct_reward_func/std": 0.14985983073711395, "step": 1585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 1485.607177734375, "completions/mean_terminated_length": 1485.607177734375, "completions/min_length": 662.0, "completions/min_terminated_length": 662.0, "epoch": 2.470404984423676, "grad_norm": 0.5818580389022827, "kl": 0.05325938202440739, "learning_rate": 1.015625e-06, "loss": 0.0263, "num_tokens": 206479668.0, "reward": 1.5271010398864746, "reward_std": 0.07464922964572906, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5271009206771851, "rewards/correct_reward_func/std": 0.1695384681224823, "step": 1586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 1588.5595703125, "completions/mean_terminated_length": 1509.0, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 2.47196261682243, "grad_norm": 0.5441533327102661, "kl": 0.05095021240413189, "learning_rate": 1.0149999999999998e-06, "loss": 0.0448, "num_tokens": 206619167.0, "reward": 1.5085726976394653, "reward_std": 0.11460548639297485, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5323821902275085, "rewards/correct_reward_func/std": 0.16604027152061462, "step": 1587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2949.0, "completions/max_terminated_length": 2949.0, "completions/mean_length": 1557.0714111328125, "completions/mean_terminated_length": 1557.0714111328125, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "epoch": 2.473520249221184, "grad_norm": 0.5785442590713501, "kl": 0.05216461047530174, "learning_rate": 1.014375e-06, "loss": 0.0267, "num_tokens": 206756009.0, "reward": 1.4966325759887695, "reward_std": 0.05284800007939339, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49663248658180237, "rewards/correct_reward_func/std": 0.17222245037555695, "step": 1588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3918.0, "completions/mean_length": 1628.40478515625, "completions/mean_terminated_length": 1549.3251953125, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 2.4750778816199377, "grad_norm": 0.5639784336090088, "kl": 0.04856296256184578, "learning_rate": 1.0137499999999998e-06, "loss": 0.0699, "num_tokens": 206898681.0, "reward": 1.5070228576660156, "reward_std": 0.10061001032590866, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5070227980613708, "rewards/correct_reward_func/std": 0.1676253378391266, "step": 1589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2436.0, "completions/max_terminated_length": 2436.0, "completions/mean_length": 1531.3095703125, "completions/mean_terminated_length": 1531.3095703125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 2.4766355140186915, "grad_norm": 0.6317888498306274, "kl": 0.051237037405371666, "learning_rate": 1.013125e-06, "loss": 0.0307, "num_tokens": 207033407.0, "reward": 1.4436553716659546, "reward_std": 0.062011465430259705, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4436551630496979, "rewards/correct_reward_func/std": 0.1413443237543106, "step": 1590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1469.8095703125, "completions/mean_terminated_length": 1469.8095703125, "completions/min_length": 1057.0, "completions/min_terminated_length": 1057.0, "epoch": 2.4781931464174454, "grad_norm": 0.5992113947868347, "kl": 0.05155971460044384, "learning_rate": 1.0124999999999998e-06, "loss": 0.0056, "num_tokens": 207162757.0, "reward": 1.5198308229446411, "reward_std": 0.06754729151725769, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5198308229446411, "rewards/correct_reward_func/std": 0.13552133738994598, "step": 1591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2784.0, "completions/max_terminated_length": 2784.0, "completions/mean_length": 1561.6429443359375, "completions/mean_terminated_length": 1561.6429443359375, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 2.479750778816199, "grad_norm": 0.5880371332168579, "kl": 0.05097147636115551, "learning_rate": 1.011875e-06, "loss": -0.0197, "num_tokens": 207299833.0, "reward": 1.47348952293396, "reward_std": 0.08504419028759003, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48539412021636963, "rewards/correct_reward_func/std": 0.1555197685956955, "step": 1592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1507.107177734375, "completions/mean_terminated_length": 1507.107177734375, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 2.481308411214953, "grad_norm": 0.6277386546134949, "kl": 0.0506266113370657, "learning_rate": 1.0112499999999998e-06, "loss": 0.0015, "num_tokens": 207432436.0, "reward": 1.530014991760254, "reward_std": 0.05494409799575806, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5300148725509644, "rewards/correct_reward_func/std": 0.13875579833984375, "step": 1593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4265.0, "completions/max_terminated_length": 4265.0, "completions/mean_length": 1516.297607421875, "completions/mean_terminated_length": 1516.297607421875, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "epoch": 2.4828660436137073, "grad_norm": 0.5781358480453491, "kl": 0.05219533480703831, "learning_rate": 1.010625e-06, "loss": 0.0097, "num_tokens": 207565817.0, "reward": 1.4938114881515503, "reward_std": 0.08440135419368744, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5057162046432495, "rewards/correct_reward_func/std": 0.13546234369277954, "step": 1594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 1549.3214111328125, "completions/mean_terminated_length": 1549.3214111328125, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 2.484423676012461, "grad_norm": 0.5642051696777344, "kl": 0.04951576888561249, "learning_rate": 1.0099999999999999e-06, "loss": -0.0009, "num_tokens": 207702062.0, "reward": 1.5093501806259155, "reward_std": 0.09381642937660217, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5212547779083252, "rewards/correct_reward_func/std": 0.14744646847248077, "step": 1595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 1568.8095703125, "completions/mean_terminated_length": 1568.8095703125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 2.485981308411215, "grad_norm": 0.5947378873825073, "kl": 0.04908100888133049, "learning_rate": 1.009375e-06, "loss": 0.0191, "num_tokens": 207839974.0, "reward": 1.461796522140503, "reward_std": 0.059961721301078796, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46179643273353577, "rewards/correct_reward_func/std": 0.17661380767822266, "step": 1596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2634.0, "completions/max_terminated_length": 2634.0, "completions/mean_length": 1569.9405517578125, "completions/mean_terminated_length": 1569.9405517578125, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 2.487538940809969, "grad_norm": 0.5761712789535522, "kl": 0.0520041324198246, "learning_rate": 1.00875e-06, "loss": 0.006, "num_tokens": 207977687.0, "reward": 1.506062626838684, "reward_std": 0.09736565500497818, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5179673433303833, "rewards/correct_reward_func/std": 0.15767443180084229, "step": 1597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1499.8095703125, "completions/mean_terminated_length": 1499.8095703125, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 2.4890965732087227, "grad_norm": 0.6091989278793335, "kl": 0.05175224132835865, "learning_rate": 1.008125e-06, "loss": 0.0043, "num_tokens": 208109431.0, "reward": 1.4653962850570679, "reward_std": 0.05529634281992912, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4653961658477783, "rewards/correct_reward_func/std": 0.14931879937648773, "step": 1598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 1588.3690185546875, "completions/mean_terminated_length": 1588.3690185546875, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 2.4906542056074765, "grad_norm": 0.6155436635017395, "kl": 0.05598420277237892, "learning_rate": 1.0075e-06, "loss": -0.0226, "num_tokens": 208248824.0, "reward": 1.5021013021469116, "reward_std": 0.07482831180095673, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5021011829376221, "rewards/correct_reward_func/std": 0.16470223665237427, "step": 1599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2709.0, "completions/mean_length": 1588.5, "completions/mean_terminated_length": 1508.939697265625, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 2.4922118380062304, "grad_norm": 0.559097409248352, "kl": 0.04784083552658558, "learning_rate": 1.006875e-06, "loss": 0.0683, "num_tokens": 208388306.0, "reward": 1.5173500776290894, "reward_std": 0.0767589882016182, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.517349898815155, "rewards/correct_reward_func/std": 0.16332249343395233, "step": 1600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2510.0, "completions/max_terminated_length": 2510.0, "completions/mean_length": 1504.0357666015625, "completions/mean_terminated_length": 1504.0357666015625, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 2.4937694704049846, "grad_norm": 0.5860721468925476, "kl": 0.05065094865858555, "learning_rate": 1.0062500000000001e-06, "loss": -0.0003, "num_tokens": 208520585.0, "reward": 1.561385989189148, "reward_std": 0.043802518397569656, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5613857507705688, "rewards/correct_reward_func/std": 0.13197611272335052, "step": 1601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2453.0, "completions/mean_length": 1596.0357666015625, "completions/mean_terminated_length": 1516.566162109375, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 2.4953271028037385, "grad_norm": 0.5750558972358704, "kl": 0.04949954338371754, "learning_rate": 1.005625e-06, "loss": 0.0731, "num_tokens": 208660634.0, "reward": 1.5180797576904297, "reward_std": 0.09522654861211777, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5299844741821289, "rewards/correct_reward_func/std": 0.16583122313022614, "step": 1602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2887.0, "completions/max_terminated_length": 2887.0, "completions/mean_length": 1562.0595703125, "completions/mean_terminated_length": 1562.0595703125, "completions/min_length": 1076.0, "completions/min_terminated_length": 1076.0, "epoch": 2.4968847352024923, "grad_norm": 0.6012449264526367, "kl": 0.04941224493086338, "learning_rate": 1.005e-06, "loss": -0.0131, "num_tokens": 208797973.0, "reward": 1.4961917400360107, "reward_std": 0.055871374905109406, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4961915612220764, "rewards/correct_reward_func/std": 0.1674424558877945, "step": 1603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 1651.5357666015625, "completions/mean_terminated_length": 1572.73486328125, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 2.498442367601246, "grad_norm": 0.5377492308616638, "kl": 0.04983988776803017, "learning_rate": 1.004375e-06, "loss": 0.0742, "num_tokens": 208942870.0, "reward": 1.467326283454895, "reward_std": 0.08338472247123718, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.479231059551239, "rewards/correct_reward_func/std": 0.16722595691680908, "step": 1604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1514.452392578125, "completions/mean_terminated_length": 1514.452392578125, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 2.5, "grad_norm": 0.6342684626579285, "kl": 0.051447538658976555, "learning_rate": 1.00375e-06, "loss": 0.0017, "num_tokens": 209076192.0, "reward": 1.4550957679748535, "reward_std": 0.05658799409866333, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4550958275794983, "rewards/correct_reward_func/std": 0.14392852783203125, "step": 1605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1658.3333740234375, "completions/mean_terminated_length": 1579.6143798828125, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 2.501557632398754, "grad_norm": 0.5678313970565796, "kl": 0.04905891604721546, "learning_rate": 1.003125e-06, "loss": 0.0405, "num_tokens": 209221636.0, "reward": 1.4966896772384644, "reward_std": 0.05913669988512993, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49668970704078674, "rewards/correct_reward_func/std": 0.15522529184818268, "step": 1606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2498.0, "completions/max_terminated_length": 2498.0, "completions/mean_length": 1539.6429443359375, "completions/mean_terminated_length": 1539.6429443359375, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 2.5031152647975077, "grad_norm": 0.5939035415649414, "kl": 0.04927397333085537, "learning_rate": 1.0025e-06, "loss": 0.0332, "num_tokens": 209357032.0, "reward": 1.5666331052780151, "reward_std": 0.06067349389195442, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5666331052780151, "rewards/correct_reward_func/std": 0.12564845383167267, "step": 1607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1672.4761962890625, "completions/mean_terminated_length": 1513.46337890625, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 2.5046728971962615, "grad_norm": 0.529949963092804, "kl": 0.04862845875322819, "learning_rate": 1.001875e-06, "loss": 0.0895, "num_tokens": 209503538.0, "reward": 1.4722119569778442, "reward_std": 0.11629762500524521, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4841166138648987, "rewards/correct_reward_func/std": 0.16969376802444458, "step": 1608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2491.0, "completions/mean_length": 1638.3690185546875, "completions/mean_terminated_length": 1559.4095458984375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.5062305295950154, "grad_norm": 0.5374082326889038, "kl": 0.05002483166754246, "learning_rate": 1.00125e-06, "loss": 0.0353, "num_tokens": 209646981.0, "reward": 1.4784367084503174, "reward_std": 0.05023537576198578, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4784366488456726, "rewards/correct_reward_func/std": 0.184120312333107, "step": 1609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1494.857177734375, "completions/mean_terminated_length": 1494.857177734375, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "epoch": 2.507788161993769, "grad_norm": 0.5800879001617432, "kl": 0.052175672724843025, "learning_rate": 1.000625e-06, "loss": 0.008, "num_tokens": 209778471.0, "reward": 1.5175479650497437, "reward_std": 0.04924726486206055, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5175480842590332, "rewards/correct_reward_func/std": 0.19751298427581787, "step": 1610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2296.0, "completions/max_terminated_length": 2296.0, "completions/mean_length": 1453.0238037109375, "completions/mean_terminated_length": 1453.0238037109375, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "epoch": 2.5093457943925235, "grad_norm": 0.5783474445343018, "kl": 0.050958720967173576, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 209906471.0, "reward": 1.534216046333313, "reward_std": 0.06847457587718964, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.534216046333313, "rewards/correct_reward_func/std": 0.14987419545650482, "step": 1611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 1557.2857666015625, "completions/mean_terminated_length": 1557.2857666015625, "completions/min_length": 1043.0, "completions/min_terminated_length": 1043.0, "epoch": 2.5109034267912773, "grad_norm": 0.5748253464698792, "kl": 0.04951940476894379, "learning_rate": 9.99375e-07, "loss": -0.0112, "num_tokens": 210043427.0, "reward": 1.5567976236343384, "reward_std": 0.045837413519620895, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5567975044250488, "rewards/correct_reward_func/std": 0.12818868458271027, "step": 1612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1509.261962890625, "completions/mean_terminated_length": 1509.261962890625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 2.512461059190031, "grad_norm": 0.6009491086006165, "kl": 0.05423161759972572, "learning_rate": 9.9875e-07, "loss": -0.0073, "num_tokens": 210176193.0, "reward": 1.5251411199569702, "reward_std": 0.09009889513254166, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5370456576347351, "rewards/correct_reward_func/std": 0.1600138396024704, "step": 1613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2616.0, "completions/max_terminated_length": 2616.0, "completions/mean_length": 1544.511962890625, "completions/mean_terminated_length": 1544.511962890625, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 2.514018691588785, "grad_norm": 0.5461153388023376, "kl": 0.05127429775893688, "learning_rate": 9.98125e-07, "loss": -0.0364, "num_tokens": 210311902.0, "reward": 1.4934782981872559, "reward_std": 0.04722701013088226, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4934782385826111, "rewards/correct_reward_func/std": 0.14512115716934204, "step": 1614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2252.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 1504.452392578125, "completions/mean_terminated_length": 1504.452392578125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 2.515576323987539, "grad_norm": 0.564798891544342, "kl": 0.05339285172522068, "learning_rate": 9.975e-07, "loss": 0.0123, "num_tokens": 210444324.0, "reward": 1.4929882287979126, "reward_std": 0.06424888223409653, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4929882287979126, "rewards/correct_reward_func/std": 0.14157631993293762, "step": 1615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2485.0, "completions/max_terminated_length": 2485.0, "completions/mean_length": 1551.0238037109375, "completions/mean_terminated_length": 1551.0238037109375, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 2.5171339563862927, "grad_norm": 0.5884081125259399, "kl": 0.050581540912389755, "learning_rate": 9.968749999999999e-07, "loss": 0.0114, "num_tokens": 210580568.0, "reward": 1.5508390665054321, "reward_std": 0.08816705644130707, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5627437829971313, "rewards/correct_reward_func/std": 0.1634836047887802, "step": 1616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 1516.107177734375, "completions/mean_terminated_length": 1516.107177734375, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "epoch": 2.518691588785047, "grad_norm": 0.5574733018875122, "kl": 0.05302547477185726, "learning_rate": 9.9625e-07, "loss": -0.0271, "num_tokens": 210714107.0, "reward": 1.49958336353302, "reward_std": 0.05389043316245079, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49958324432373047, "rewards/correct_reward_func/std": 0.17000599205493927, "step": 1617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2800.0, "completions/mean_length": 1633.3690185546875, "completions/mean_terminated_length": 1554.349365234375, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 2.520249221183801, "grad_norm": 0.5731033682823181, "kl": 0.049445370212197304, "learning_rate": 9.956249999999999e-07, "loss": 0.0768, "num_tokens": 210857208.0, "reward": 1.5091835260391235, "reward_std": 0.08943185210227966, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.521088182926178, "rewards/correct_reward_func/std": 0.20577535033226013, "step": 1618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2830.0, "completions/max_terminated_length": 2830.0, "completions/mean_length": 1511.46435546875, "completions/mean_terminated_length": 1511.46435546875, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 2.5218068535825546, "grad_norm": 0.5607713460922241, "kl": 0.05166306160390377, "learning_rate": 9.95e-07, "loss": 0.0052, "num_tokens": 210990147.0, "reward": 1.5118337869644165, "reward_std": 0.055470749735832214, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.511833667755127, "rewards/correct_reward_func/std": 0.16678674519062042, "step": 1619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2299.0, "completions/max_terminated_length": 2299.0, "completions/mean_length": 1515.5357666015625, "completions/mean_terminated_length": 1515.5357666015625, "completions/min_length": 587.0, "completions/min_terminated_length": 587.0, "epoch": 2.5233644859813085, "grad_norm": 0.602757453918457, "kl": 0.05295521579682827, "learning_rate": 9.94375e-07, "loss": 0.0013, "num_tokens": 211123644.0, "reward": 1.477386474609375, "reward_std": 0.06374260783195496, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4773864150047302, "rewards/correct_reward_func/std": 0.12893356382846832, "step": 1620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2761.0, "completions/max_terminated_length": 2761.0, "completions/mean_length": 1595.357177734375, "completions/mean_terminated_length": 1595.357177734375, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 2.5249221183800623, "grad_norm": 0.5467821955680847, "kl": 0.051239559426903725, "learning_rate": 9.9375e-07, "loss": 0.0346, "num_tokens": 211263870.0, "reward": 1.5702136754989624, "reward_std": 0.07906346023082733, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5821185111999512, "rewards/correct_reward_func/std": 0.16426138579845428, "step": 1621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1505.0357666015625, "completions/mean_terminated_length": 1505.0357666015625, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "epoch": 2.526479750778816, "grad_norm": 0.6225073933601379, "kl": 0.055177273228764534, "learning_rate": 9.93125e-07, "loss": -0.0098, "num_tokens": 211396203.0, "reward": 1.4895132780075073, "reward_std": 0.05655066296458244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4895133078098297, "rewards/correct_reward_func/std": 0.14057649672031403, "step": 1622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1568.047607421875, "completions/mean_terminated_length": 1568.047607421875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "epoch": 2.52803738317757, "grad_norm": 0.5538238883018494, "kl": 0.050687333568930626, "learning_rate": 9.925e-07, "loss": 0.0157, "num_tokens": 211533955.0, "reward": 1.5081019401550293, "reward_std": 0.08804195374250412, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.520006537437439, "rewards/correct_reward_func/std": 0.1863691210746765, "step": 1623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1485.166748046875, "completions/mean_terminated_length": 1485.166748046875, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "epoch": 2.529595015576324, "grad_norm": 0.5821257829666138, "kl": 0.051482876762747765, "learning_rate": 9.91875e-07, "loss": 0.0154, "num_tokens": 211664535.0, "reward": 1.4470605850219727, "reward_std": 0.052217576652765274, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4470604360103607, "rewards/correct_reward_func/std": 0.14432884752750397, "step": 1624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2692.0, "completions/max_terminated_length": 2692.0, "completions/mean_length": 1505.9285888671875, "completions/mean_terminated_length": 1505.9285888671875, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 2.5311526479750777, "grad_norm": 0.5766447186470032, "kl": 0.052913960069417953, "learning_rate": 9.912499999999998e-07, "loss": -0.0009, "num_tokens": 211796955.0, "reward": 1.4787414073944092, "reward_std": 0.1025603860616684, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.490646094083786, "rewards/correct_reward_func/std": 0.170021191239357, "step": 1625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2582.0, "completions/mean_length": 1602.7261962890625, "completions/mean_terminated_length": 1523.3372802734375, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 2.5327102803738315, "grad_norm": 0.5610179305076599, "kl": 0.049728455021977425, "learning_rate": 9.90625e-07, "loss": 0.0735, "num_tokens": 211937470.0, "reward": 1.527881145477295, "reward_std": 0.1073220819234848, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5397859215736389, "rewards/correct_reward_func/std": 0.19870956242084503, "step": 1626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2508.0, "completions/max_terminated_length": 2508.0, "completions/mean_length": 1516.5, "completions/mean_terminated_length": 1516.5, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 2.534267912772586, "grad_norm": 0.6085204482078552, "kl": 0.04930474795401096, "learning_rate": 9.9e-07, "loss": 0.0194, "num_tokens": 212070862.0, "reward": 1.561302900314331, "reward_std": 0.08325515687465668, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5732077360153198, "rewards/correct_reward_func/std": 0.1891576498746872, "step": 1627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2693.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 1524.9405517578125, "completions/mean_terminated_length": 1524.9405517578125, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 2.5358255451713396, "grad_norm": 0.6082260608673096, "kl": 0.052275942638516426, "learning_rate": 9.89375e-07, "loss": 0.015, "num_tokens": 212204951.0, "reward": 1.5587267875671387, "reward_std": 0.07445348799228668, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5587266683578491, "rewards/correct_reward_func/std": 0.17131729423999786, "step": 1628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 1447.5833740234375, "completions/mean_terminated_length": 1447.5833740234375, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 2.5373831775700935, "grad_norm": 0.5918328762054443, "kl": 0.0515949260443449, "learning_rate": 9.8875e-07, "loss": -0.0138, "num_tokens": 212332512.0, "reward": 1.4934695959091187, "reward_std": 0.04861073195934296, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4934695065021515, "rewards/correct_reward_func/std": 0.165501207113266, "step": 1629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 1552.6429443359375, "completions/mean_terminated_length": 1552.6429443359375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.5389408099688473, "grad_norm": 0.5309395790100098, "kl": 0.05055820755660534, "learning_rate": 9.88125e-07, "loss": -0.0086, "num_tokens": 212469066.0, "reward": 1.5057357549667358, "reward_std": 0.054260846227407455, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5057356953620911, "rewards/correct_reward_func/std": 0.15043847262859344, "step": 1630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2247.0, "completions/max_terminated_length": 2247.0, "completions/mean_length": 1521.202392578125, "completions/mean_terminated_length": 1521.202392578125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.540498442367601, "grad_norm": 0.5420085191726685, "kl": 0.05097649060189724, "learning_rate": 9.875e-07, "loss": -0.012, "num_tokens": 212603033.0, "reward": 1.4791878461837769, "reward_std": 0.08031619340181351, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4910925328731537, "rewards/correct_reward_func/std": 0.13377530872821808, "step": 1631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2378.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1494.5357666015625, "completions/mean_terminated_length": 1494.5357666015625, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 2.542056074766355, "grad_norm": 0.5785639882087708, "kl": 0.05017243139445782, "learning_rate": 9.86875e-07, "loss": 0.0009, "num_tokens": 212734676.0, "reward": 1.5313811302185059, "reward_std": 0.054354630410671234, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5313810706138611, "rewards/correct_reward_func/std": 0.14237754046916962, "step": 1632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2082.0, "completions/max_terminated_length": 2082.0, "completions/mean_length": 1470.2857666015625, "completions/mean_terminated_length": 1470.2857666015625, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 2.5436137071651093, "grad_norm": 0.5943959951400757, "kl": 0.050632892176508904, "learning_rate": 9.862499999999999e-07, "loss": 0.0157, "num_tokens": 212864150.0, "reward": 1.488013744354248, "reward_std": 0.07632701098918915, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49991849064826965, "rewards/correct_reward_func/std": 0.13955065608024597, "step": 1633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2765.0, "completions/max_terminated_length": 2765.0, "completions/mean_length": 1487.0714111328125, "completions/mean_terminated_length": 1487.0714111328125, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 2.545171339563863, "grad_norm": 0.5672664046287537, "kl": 0.048811692744493484, "learning_rate": 9.85625e-07, "loss": -0.0166, "num_tokens": 212994950.0, "reward": 1.5350711345672607, "reward_std": 0.04656928405165672, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.535071074962616, "rewards/correct_reward_func/std": 0.15840326249599457, "step": 1634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1512.2381591796875, "completions/mean_terminated_length": 1512.2381591796875, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 2.546728971962617, "grad_norm": 0.5941212773323059, "kl": 0.0509329829365015, "learning_rate": 9.849999999999999e-07, "loss": 0.0083, "num_tokens": 213127930.0, "reward": 1.496375322341919, "reward_std": 0.10456540435552597, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5201847553253174, "rewards/correct_reward_func/std": 0.13638147711753845, "step": 1635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1518.25, "completions/mean_terminated_length": 1518.25, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "epoch": 2.5482866043613708, "grad_norm": 0.5865824222564697, "kl": 0.04903215542435646, "learning_rate": 9.84375e-07, "loss": 0.0121, "num_tokens": 213261493.0, "reward": 1.5216844081878662, "reward_std": 0.038109954446554184, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5216842889785767, "rewards/correct_reward_func/std": 0.14145228266716003, "step": 1636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 1494.261962890625, "completions/mean_terminated_length": 1494.261962890625, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 2.5498442367601246, "grad_norm": 0.5987250804901123, "kl": 0.05030805431306362, "learning_rate": 9.8375e-07, "loss": 0.0168, "num_tokens": 213392867.0, "reward": 1.4683319330215454, "reward_std": 0.05512666702270508, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4683319330215454, "rewards/correct_reward_func/std": 0.1699049472808838, "step": 1637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2458.0, "completions/max_terminated_length": 2458.0, "completions/mean_length": 1477.1785888671875, "completions/mean_terminated_length": 1477.1785888671875, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 2.5514018691588785, "grad_norm": 0.5852728486061096, "kl": 0.049666061997413635, "learning_rate": 9.83125e-07, "loss": 0.0072, "num_tokens": 213522980.0, "reward": 1.471039891242981, "reward_std": 0.05152140557765961, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47103986144065857, "rewards/correct_reward_func/std": 0.13039211928844452, "step": 1638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2803.0, "completions/max_terminated_length": 2803.0, "completions/mean_length": 1535.2857666015625, "completions/mean_terminated_length": 1535.2857666015625, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 2.5529595015576323, "grad_norm": 0.5496641993522644, "kl": 0.05307148024439812, "learning_rate": 9.825e-07, "loss": 0.0176, "num_tokens": 213658052.0, "reward": 1.5162016153335571, "reward_std": 0.0574827715754509, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5162014961242676, "rewards/correct_reward_func/std": 0.16080212593078613, "step": 1639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1476.666748046875, "completions/mean_terminated_length": 1476.666748046875, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "epoch": 2.554517133956386, "grad_norm": 0.6022453904151917, "kl": 0.050816189497709274, "learning_rate": 9.81875e-07, "loss": 0.0191, "num_tokens": 213788032.0, "reward": 1.556488037109375, "reward_std": 0.07109459489583969, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5564881563186646, "rewards/correct_reward_func/std": 0.12744058668613434, "step": 1640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1468.09521484375, "completions/mean_terminated_length": 1468.09521484375, "completions/min_length": 931.0, "completions/min_terminated_length": 931.0, "epoch": 2.55607476635514, "grad_norm": 0.591855525970459, "kl": 0.04847092740237713, "learning_rate": 9.8125e-07, "loss": -0.0024, "num_tokens": 213917460.0, "reward": 1.4558780193328857, "reward_std": 0.07982868701219559, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4677827060222626, "rewards/correct_reward_func/std": 0.17505809664726257, "step": 1641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 1426.5833740234375, "completions/mean_terminated_length": 1426.5833740234375, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 2.557632398753894, "grad_norm": 0.5832905769348145, "kl": 0.05035056546330452, "learning_rate": 9.806249999999998e-07, "loss": 0.0064, "num_tokens": 214043407.0, "reward": 1.595166563987732, "reward_std": 0.06131565198302269, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5951663255691528, "rewards/correct_reward_func/std": 0.11501199752092361, "step": 1642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 1475.547607421875, "completions/mean_terminated_length": 1475.547607421875, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 2.559190031152648, "grad_norm": 0.5847628116607666, "kl": 0.04987966641783714, "learning_rate": 9.8e-07, "loss": 0.0295, "num_tokens": 214173401.0, "reward": 1.5148693323135376, "reward_std": 0.08328548073768616, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5267741084098816, "rewards/correct_reward_func/std": 0.18570058047771454, "step": 1643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5496.0, "completions/max_terminated_length": 5496.0, "completions/mean_length": 1484.5238037109375, "completions/mean_terminated_length": 1484.5238037109375, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 2.560747663551402, "grad_norm": 0.5943768620491028, "kl": 0.048675015568733215, "learning_rate": 9.79375e-07, "loss": 0.0575, "num_tokens": 214303975.0, "reward": 1.5754634141921997, "reward_std": 0.0642186850309372, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5754635334014893, "rewards/correct_reward_func/std": 0.1508868932723999, "step": 1644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2199.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 1495.8809814453125, "completions/mean_terminated_length": 1495.8809814453125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 2.5623052959501558, "grad_norm": 0.5496551394462585, "kl": 0.048282695934176445, "learning_rate": 9.7875e-07, "loss": 0.0191, "num_tokens": 214435797.0, "reward": 1.5602271556854248, "reward_std": 0.06466874480247498, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5602271556854248, "rewards/correct_reward_func/std": 0.15355198085308075, "step": 1645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2697.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 1498.511962890625, "completions/mean_terminated_length": 1498.511962890625, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 2.5638629283489096, "grad_norm": 0.6119274497032166, "kl": 0.05113053880631924, "learning_rate": 9.78125e-07, "loss": 0.0022, "num_tokens": 214567744.0, "reward": 1.4761974811553955, "reward_std": 0.07773737609386444, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4881022870540619, "rewards/correct_reward_func/std": 0.1255892813205719, "step": 1646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1431.7381591796875, "completions/mean_terminated_length": 1431.7381591796875, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 2.5654205607476634, "grad_norm": 0.606296956539154, "kl": 0.05013350769877434, "learning_rate": 9.775e-07, "loss": 0.0027, "num_tokens": 214693854.0, "reward": 1.534840703010559, "reward_std": 0.07128684222698212, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5348407030105591, "rewards/correct_reward_func/std": 0.18061499297618866, "step": 1647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2144.0, "completions/max_terminated_length": 2144.0, "completions/mean_length": 1397.297607421875, "completions/mean_terminated_length": 1397.297607421875, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 2.5669781931464173, "grad_norm": 0.5801633596420288, "kl": 0.04820682480931282, "learning_rate": 9.76875e-07, "loss": -0.0045, "num_tokens": 214817029.0, "reward": 1.5398229360580444, "reward_std": 0.07624775916337967, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5398228764533997, "rewards/correct_reward_func/std": 0.1714152991771698, "step": 1648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2157.0, "completions/max_terminated_length": 2157.0, "completions/mean_length": 1403.857177734375, "completions/mean_terminated_length": 1403.857177734375, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 2.5685358255451716, "grad_norm": 0.5909891128540039, "kl": 0.049853354692459106, "learning_rate": 9.7625e-07, "loss": -0.0255, "num_tokens": 214941115.0, "reward": 1.5494624376296997, "reward_std": 0.048710573464632034, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5494622588157654, "rewards/correct_reward_func/std": 0.1409223973751068, "step": 1649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2401.0, "completions/max_terminated_length": 2401.0, "completions/mean_length": 1483.25, "completions/mean_terminated_length": 1483.25, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 2.5700934579439254, "grad_norm": 0.6053852438926697, "kl": 0.0489772018045187, "learning_rate": 9.756249999999999e-07, "loss": 0.0022, "num_tokens": 215071810.0, "reward": 1.5003608465194702, "reward_std": 0.12241682410240173, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5241703391075134, "rewards/correct_reward_func/std": 0.13935251533985138, "step": 1650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1409.452392578125, "completions/mean_terminated_length": 1409.452392578125, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.5716510903426792, "grad_norm": 0.5579494833946228, "kl": 0.049285903573036194, "learning_rate": 9.75e-07, "loss": 0.0069, "num_tokens": 215196000.0, "reward": 1.5749913454055786, "reward_std": 0.05776212364435196, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5749912858009338, "rewards/correct_reward_func/std": 0.13392944633960724, "step": 1651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2138.0, "completions/max_terminated_length": 2138.0, "completions/mean_length": 1409.9761962890625, "completions/mean_terminated_length": 1409.9761962890625, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 2.573208722741433, "grad_norm": 0.6727893352508545, "kl": 0.05051317624747753, "learning_rate": 9.743749999999999e-07, "loss": 0.0248, "num_tokens": 215320300.0, "reward": 1.5002202987670898, "reward_std": 0.08934829384088516, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5121248960494995, "rewards/correct_reward_func/std": 0.1445939540863037, "step": 1652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1380.40478515625, "completions/mean_terminated_length": 1380.40478515625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 2.574766355140187, "grad_norm": 0.622471034526825, "kl": 0.04954817704856396, "learning_rate": 9.7375e-07, "loss": -0.0084, "num_tokens": 215442206.0, "reward": 1.465014934539795, "reward_std": 0.05216953903436661, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46501487493515015, "rewards/correct_reward_func/std": 0.13378936052322388, "step": 1653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1941.0, "completions/max_terminated_length": 1941.0, "completions/mean_length": 1418.9761962890625, "completions/mean_terminated_length": 1418.9761962890625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 2.5763239875389408, "grad_norm": 0.6289664506912231, "kl": 0.05199586972594261, "learning_rate": 9.73125e-07, "loss": 0.0112, "num_tokens": 215567496.0, "reward": 1.5112674236297607, "reward_std": 0.06788778305053711, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.511267364025116, "rewards/correct_reward_func/std": 0.1256408989429474, "step": 1654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1365.8809814453125, "completions/mean_terminated_length": 1365.8809814453125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 2.5778816199376946, "grad_norm": 0.6551897525787354, "kl": 0.05425109714269638, "learning_rate": 9.725e-07, "loss": 0.0093, "num_tokens": 215688176.0, "reward": 1.4889607429504395, "reward_std": 0.07101956754922867, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48896071314811707, "rewards/correct_reward_func/std": 0.17883706092834473, "step": 1655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1425.96435546875, "completions/mean_terminated_length": 1425.96435546875, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 2.5794392523364484, "grad_norm": 0.6105693578720093, "kl": 0.051016196608543396, "learning_rate": 9.71875e-07, "loss": 0.0046, "num_tokens": 215813849.0, "reward": 1.4934238195419312, "reward_std": 0.05195160210132599, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49342378973960876, "rewards/correct_reward_func/std": 0.1419934630393982, "step": 1656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 1412.8809814453125, "completions/mean_terminated_length": 1412.8809814453125, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 2.5809968847352023, "grad_norm": 0.5956932306289673, "kl": 0.04917511157691479, "learning_rate": 9.712499999999998e-07, "loss": -0.0002, "num_tokens": 215938549.0, "reward": 1.455902338027954, "reward_std": 0.05434668809175491, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45590224862098694, "rewards/correct_reward_func/std": 0.1384110152721405, "step": 1657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2975.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 1402.40478515625, "completions/mean_terminated_length": 1402.40478515625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 2.582554517133956, "grad_norm": 0.6204903721809387, "kl": 0.049942273646593094, "learning_rate": 9.70625e-07, "loss": -0.0367, "num_tokens": 216062303.0, "reward": 1.4913082122802734, "reward_std": 0.04941417649388313, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4913082718849182, "rewards/correct_reward_func/std": 0.15925225615501404, "step": 1658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1427.9405517578125, "completions/mean_terminated_length": 1427.9405517578125, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "epoch": 2.5841121495327104, "grad_norm": 0.6375299096107483, "kl": 0.05218167416751385, "learning_rate": 9.7e-07, "loss": -0.0136, "num_tokens": 216188262.0, "reward": 1.4681142568588257, "reward_std": 0.046969909220933914, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4681141674518585, "rewards/correct_reward_func/std": 0.1534123718738556, "step": 1659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2112.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 1464.96435546875, "completions/mean_terminated_length": 1464.96435546875, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 2.5856697819314642, "grad_norm": 0.6220073699951172, "kl": 0.0485758688300848, "learning_rate": 9.69375e-07, "loss": 0.0152, "num_tokens": 216317595.0, "reward": 1.5310637950897217, "reward_std": 0.05226462334394455, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5310637354850769, "rewards/correct_reward_func/std": 0.13809573650360107, "step": 1660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1459.1190185546875, "completions/mean_terminated_length": 1459.1190185546875, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 2.587227414330218, "grad_norm": 0.6240924000740051, "kl": 0.04917978122830391, "learning_rate": 9.6875e-07, "loss": -0.0099, "num_tokens": 216446077.0, "reward": 1.4470676183700562, "reward_std": 0.053474023938179016, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.447067528963089, "rewards/correct_reward_func/std": 0.1407385915517807, "step": 1661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1417.4761962890625, "completions/mean_terminated_length": 1417.4761962890625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 2.588785046728972, "grad_norm": 0.6181928515434265, "kl": 0.052147043868899345, "learning_rate": 9.68125e-07, "loss": -0.0202, "num_tokens": 216571103.0, "reward": 1.5402193069458008, "reward_std": 0.06166123226284981, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.540219247341156, "rewards/correct_reward_func/std": 0.1561741679906845, "step": 1662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1948.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 1407.2738037109375, "completions/mean_terminated_length": 1407.2738037109375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 2.5903426791277258, "grad_norm": 0.627621591091156, "kl": 0.05073278583586216, "learning_rate": 9.675e-07, "loss": 0.0262, "num_tokens": 216695242.0, "reward": 1.4875036478042603, "reward_std": 0.07585971802473068, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4875035881996155, "rewards/correct_reward_func/std": 0.138135626912117, "step": 1663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1481.6429443359375, "completions/mean_terminated_length": 1481.6429443359375, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 2.5919003115264796, "grad_norm": 0.5519945025444031, "kl": 0.04902138374745846, "learning_rate": 9.66875e-07, "loss": -0.0195, "num_tokens": 216825772.0, "reward": 1.5137684345245361, "reward_std": 0.07194392383098602, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5256730914115906, "rewards/correct_reward_func/std": 0.1775560826063156, "step": 1664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1964.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 1407.952392578125, "completions/mean_terminated_length": 1407.952392578125, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 2.593457943925234, "grad_norm": 0.6002561450004578, "kl": 0.05004278011620045, "learning_rate": 9.6625e-07, "loss": 0.0398, "num_tokens": 216949812.0, "reward": 1.462935209274292, "reward_std": 0.08074159175157547, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47483980655670166, "rewards/correct_reward_func/std": 0.1654358208179474, "step": 1665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2073.0, "completions/max_terminated_length": 2073.0, "completions/mean_length": 1408.7261962890625, "completions/mean_terminated_length": 1408.7261962890625, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "epoch": 2.5950155763239877, "grad_norm": 0.5869240164756775, "kl": 0.04966321401298046, "learning_rate": 9.65625e-07, "loss": -0.0137, "num_tokens": 217074007.0, "reward": 1.533635139465332, "reward_std": 0.054338645190000534, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5336350202560425, "rewards/correct_reward_func/std": 0.12792766094207764, "step": 1666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2053.0, "completions/max_terminated_length": 2053.0, "completions/mean_length": 1390.4881591796875, "completions/mean_terminated_length": 1390.4881591796875, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 2.5965732087227416, "grad_norm": 0.5971348285675049, "kl": 0.050421176478266716, "learning_rate": 9.649999999999999e-07, "loss": -0.0109, "num_tokens": 217196652.0, "reward": 1.4888370037078857, "reward_std": 0.05267290771007538, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48883679509162903, "rewards/correct_reward_func/std": 0.1234746053814888, "step": 1667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 1553.5714111328125, "completions/mean_terminated_length": 1473.59033203125, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 2.5981308411214954, "grad_norm": 0.6165139675140381, "kl": 0.06656952388584614, "learning_rate": 9.64375e-07, "loss": 0.0593, "num_tokens": 217333386.0, "reward": 1.533720850944519, "reward_std": 0.0649760290980339, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5337207913398743, "rewards/correct_reward_func/std": 0.20481149852275848, "step": 1668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5341.0, "completions/max_terminated_length": 5341.0, "completions/mean_length": 1546.7857666015625, "completions/mean_terminated_length": 1546.7857666015625, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 2.5996884735202492, "grad_norm": 0.5679168701171875, "kl": 0.04792765714228153, "learning_rate": 9.637499999999999e-07, "loss": -0.0625, "num_tokens": 217469382.0, "reward": 1.502445101737976, "reward_std": 0.05674588307738304, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5024449229240417, "rewards/correct_reward_func/std": 0.1484416425228119, "step": 1669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2325.0, "completions/max_terminated_length": 2325.0, "completions/mean_length": 1447.0714111328125, "completions/mean_terminated_length": 1447.0714111328125, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 2.601246105919003, "grad_norm": 0.6210731863975525, "kl": 0.05022991634905338, "learning_rate": 9.63125e-07, "loss": -0.0105, "num_tokens": 217596942.0, "reward": 1.4871375560760498, "reward_std": 0.0627744123339653, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48713746666908264, "rewards/correct_reward_func/std": 0.1847596913576126, "step": 1670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 1469.7857666015625, "completions/mean_terminated_length": 1469.7857666015625, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.602803738317757, "grad_norm": 0.6269739866256714, "kl": 0.0508810393512249, "learning_rate": 9.624999999999999e-07, "loss": -0.0431, "num_tokens": 217726734.0, "reward": 1.5170916318893433, "reward_std": 0.06068253517150879, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5170915722846985, "rewards/correct_reward_func/std": 0.16057956218719482, "step": 1671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1418.6190185546875, "completions/mean_terminated_length": 1418.6190185546875, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 2.6043613707165107, "grad_norm": 0.6334425806999207, "kl": 0.049596572294831276, "learning_rate": 9.61875e-07, "loss": 0.0241, "num_tokens": 217851676.0, "reward": 1.5022921562194824, "reward_std": 0.09006591141223907, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5141968727111816, "rewards/correct_reward_func/std": 0.1351170539855957, "step": 1672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 1443.8333740234375, "completions/mean_terminated_length": 1443.8333740234375, "completions/min_length": 1044.0, "completions/min_terminated_length": 1044.0, "epoch": 2.6059190031152646, "grad_norm": 0.6209561824798584, "kl": 0.05437706224620342, "learning_rate": 9.6125e-07, "loss": -0.0177, "num_tokens": 217978940.0, "reward": 1.5839701890945435, "reward_std": 0.09436364471912384, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.595875084400177, "rewards/correct_reward_func/std": 0.19169044494628906, "step": 1673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2021.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1438.96435546875, "completions/mean_terminated_length": 1438.96435546875, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 2.6074766355140184, "grad_norm": 0.590447187423706, "kl": 0.0501092541962862, "learning_rate": 9.606249999999998e-07, "loss": 0.004, "num_tokens": 218105789.0, "reward": 1.5374349355697632, "reward_std": 0.05133926495909691, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5374348759651184, "rewards/correct_reward_func/std": 0.13656042516231537, "step": 1674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 1539.7857666015625, "completions/mean_terminated_length": 1539.7857666015625, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "epoch": 2.6090342679127727, "grad_norm": 0.6190969944000244, "kl": 0.051423005759716034, "learning_rate": 9.6e-07, "loss": 0.0122, "num_tokens": 218241173.0, "reward": 1.5287737846374512, "reward_std": 0.041881248354911804, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5287737846374512, "rewards/correct_reward_func/std": 0.20293985307216644, "step": 1675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1492.7857666015625, "completions/mean_terminated_length": 1492.7857666015625, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 2.6105919003115265, "grad_norm": 0.5812032222747803, "kl": 0.049530964344739914, "learning_rate": 9.59375e-07, "loss": 0.0171, "num_tokens": 218372627.0, "reward": 1.5548923015594482, "reward_std": 0.08977194875478745, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5667968988418579, "rewards/correct_reward_func/std": 0.18033479154109955, "step": 1676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1469.4761962890625, "completions/mean_terminated_length": 1469.4761962890625, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 2.6121495327102804, "grad_norm": 0.5695353746414185, "kl": 0.05072716437280178, "learning_rate": 9.5875e-07, "loss": -0.0304, "num_tokens": 218502123.0, "reward": 1.504845380783081, "reward_std": 0.04729638993740082, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5048453211784363, "rewards/correct_reward_func/std": 0.11502755433320999, "step": 1677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1477.96435546875, "completions/mean_terminated_length": 1477.96435546875, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 2.6137071651090342, "grad_norm": 0.5854849815368652, "kl": 0.04981553368270397, "learning_rate": 9.58125e-07, "loss": -0.0084, "num_tokens": 218632278.0, "reward": 1.4666979312896729, "reward_std": 0.05868620052933693, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46669769287109375, "rewards/correct_reward_func/std": 0.15134179592132568, "step": 1678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2561.0, "completions/max_terminated_length": 2561.0, "completions/mean_length": 1482.75, "completions/mean_terminated_length": 1482.75, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 2.615264797507788, "grad_norm": 0.5965787172317505, "kl": 0.051073240116238594, "learning_rate": 9.575e-07, "loss": 0.0123, "num_tokens": 218762901.0, "reward": 1.573923945426941, "reward_std": 0.08504657447338104, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5858286619186401, "rewards/correct_reward_func/std": 0.17917117476463318, "step": 1679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2542.0, "completions/max_terminated_length": 2542.0, "completions/mean_length": 1466.5595703125, "completions/mean_terminated_length": 1466.5595703125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 2.616822429906542, "grad_norm": 0.6152132749557495, "kl": 0.050383275374770164, "learning_rate": 9.56875e-07, "loss": -0.0041, "num_tokens": 218892008.0, "reward": 1.5407623052597046, "reward_std": 0.07131465524435043, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.552666962146759, "rewards/correct_reward_func/std": 0.16955766081809998, "step": 1680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2264.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 1481.9285888671875, "completions/mean_terminated_length": 1481.9285888671875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 2.618380062305296, "grad_norm": 0.6503508687019348, "kl": 0.05008958280086517, "learning_rate": 9.5625e-07, "loss": -0.0133, "num_tokens": 219022382.0, "reward": 1.554307460784912, "reward_std": 0.0764685869216919, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5662122368812561, "rewards/correct_reward_func/std": 0.1505804806947708, "step": 1681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1494.1785888671875, "completions/mean_terminated_length": 1494.1785888671875, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 2.61993769470405, "grad_norm": 0.6244161128997803, "kl": 0.05401036702096462, "learning_rate": 9.556249999999999e-07, "loss": 0.0179, "num_tokens": 219153929.0, "reward": 1.5588066577911377, "reward_std": 0.08443747460842133, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5707113742828369, "rewards/correct_reward_func/std": 0.12456187605857849, "step": 1682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2409.0, "completions/max_terminated_length": 2409.0, "completions/mean_length": 1520.25, "completions/mean_terminated_length": 1520.25, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 2.621495327102804, "grad_norm": 0.5478272438049316, "kl": 0.05052444525063038, "learning_rate": 9.55e-07, "loss": -0.0401, "num_tokens": 219287606.0, "reward": 1.5178083181381226, "reward_std": 0.05337538197636604, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5178082585334778, "rewards/correct_reward_func/std": 0.1795741617679596, "step": 1683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2405.0, "completions/max_terminated_length": 2405.0, "completions/mean_length": 1507.71435546875, "completions/mean_terminated_length": 1507.71435546875, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "epoch": 2.6230529595015577, "grad_norm": 0.5923721194267273, "kl": 0.05295459367334843, "learning_rate": 9.543749999999999e-07, "loss": 0.0024, "num_tokens": 219420254.0, "reward": 1.4730637073516846, "reward_std": 0.08490417897701263, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.496873140335083, "rewards/correct_reward_func/std": 0.1399519443511963, "step": 1684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1499.15478515625, "completions/mean_terminated_length": 1499.15478515625, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "epoch": 2.6246105919003115, "grad_norm": 0.6285800337791443, "kl": 0.05046955123543739, "learning_rate": 9.5375e-07, "loss": -0.0083, "num_tokens": 219552009.0, "reward": 1.5439660549163818, "reward_std": 0.06823564320802689, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5439661145210266, "rewards/correct_reward_func/std": 0.12573818862438202, "step": 1685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 1517.5, "completions/mean_terminated_length": 1517.5, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 2.6261682242990654, "grad_norm": 0.6260651350021362, "kl": 0.05123108811676502, "learning_rate": 9.53125e-07, "loss": -0.0056, "num_tokens": 219685299.0, "reward": 1.5445849895477295, "reward_std": 0.0771394893527031, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5564895868301392, "rewards/correct_reward_func/std": 0.16129723191261292, "step": 1686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1510.8690185546875, "completions/mean_terminated_length": 1510.8690185546875, "completions/min_length": 1077.0, "completions/min_terminated_length": 1077.0, "epoch": 2.627725856697819, "grad_norm": 0.6209577918052673, "kl": 0.05239488556981087, "learning_rate": 9.525e-07, "loss": 0.0076, "num_tokens": 219818260.0, "reward": 1.488200306892395, "reward_std": 0.09141159057617188, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5001050233840942, "rewards/correct_reward_func/std": 0.16971570253372192, "step": 1687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2809.0, "completions/max_terminated_length": 2809.0, "completions/mean_length": 1529.0238037109375, "completions/mean_terminated_length": 1529.0238037109375, "completions/min_length": 1065.0, "completions/min_terminated_length": 1065.0, "epoch": 2.629283489096573, "grad_norm": 0.5815672278404236, "kl": 0.052122319117188454, "learning_rate": 9.51875e-07, "loss": 0.0064, "num_tokens": 219952626.0, "reward": 1.5569831132888794, "reward_std": 0.06506761163473129, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5569829344749451, "rewards/correct_reward_func/std": 0.1598992496728897, "step": 1688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2324.0, "completions/max_terminated_length": 2324.0, "completions/mean_length": 1555.96435546875, "completions/mean_terminated_length": 1555.96435546875, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 2.630841121495327, "grad_norm": 0.5907567143440247, "kl": 0.05072435177862644, "learning_rate": 9.5125e-07, "loss": 0.02, "num_tokens": 220089447.0, "reward": 1.5207499265670776, "reward_std": 0.06055167689919472, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5207498669624329, "rewards/correct_reward_func/std": 0.13483476638793945, "step": 1689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1482.7738037109375, "completions/mean_terminated_length": 1482.7738037109375, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 2.6323987538940807, "grad_norm": 0.6570085883140564, "kl": 0.052822982892394066, "learning_rate": 9.50625e-07, "loss": 0.0217, "num_tokens": 220219730.0, "reward": 1.5340451002120972, "reward_std": 0.07240943610668182, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5340450406074524, "rewards/correct_reward_func/std": 0.14005158841609955, "step": 1690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1566.5357666015625, "completions/mean_terminated_length": 1486.7108154296875, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 2.633956386292835, "grad_norm": 0.633683443069458, "kl": 0.05180642195045948, "learning_rate": 9.499999999999999e-07, "loss": 0.0508, "num_tokens": 220357301.0, "reward": 1.5094845294952393, "reward_std": 0.06991244107484818, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5094844102859497, "rewards/correct_reward_func/std": 0.1890960931777954, "step": 1691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2243.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 1476.9285888671875, "completions/mean_terminated_length": 1476.9285888671875, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "epoch": 2.635514018691589, "grad_norm": 0.6175124645233154, "kl": 0.05089910887181759, "learning_rate": 9.493749999999999e-07, "loss": 0.0268, "num_tokens": 220487219.0, "reward": 1.5156797170639038, "reward_std": 0.12157899886369705, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5394890904426575, "rewards/correct_reward_func/std": 0.15065106749534607, "step": 1692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2328.0, "completions/max_terminated_length": 2328.0, "completions/mean_length": 1449.6905517578125, "completions/mean_terminated_length": 1449.6905517578125, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 2.6370716510903427, "grad_norm": 0.5715797543525696, "kl": 0.05218472704291344, "learning_rate": 9.487499999999999e-07, "loss": -0.0204, "num_tokens": 220614963.0, "reward": 1.500420331954956, "reward_std": 0.05844498798251152, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5004202723503113, "rewards/correct_reward_func/std": 0.11747856438159943, "step": 1693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1455.1785888671875, "completions/mean_terminated_length": 1455.1785888671875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.6386292834890965, "grad_norm": 0.6142057776451111, "kl": 0.051858896389603615, "learning_rate": 9.481249999999999e-07, "loss": -0.0014, "num_tokens": 220743294.0, "reward": 1.5874091386795044, "reward_std": 0.06715509295463562, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5874090790748596, "rewards/correct_reward_func/std": 0.17818407714366913, "step": 1694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2021.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 1434.511962890625, "completions/mean_terminated_length": 1434.511962890625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 2.6401869158878504, "grad_norm": 0.6616283059120178, "kl": 0.053675662726163864, "learning_rate": 9.474999999999999e-07, "loss": -0.0146, "num_tokens": 220869613.0, "reward": 1.51621675491333, "reward_std": 0.051551446318626404, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5162166357040405, "rewards/correct_reward_func/std": 0.15881773829460144, "step": 1695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2435.0, "completions/max_terminated_length": 2435.0, "completions/mean_length": 1533.797607421875, "completions/mean_terminated_length": 1533.797607421875, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 2.641744548286604, "grad_norm": 0.5601535439491272, "kl": 0.05052315630018711, "learning_rate": 9.468749999999999e-07, "loss": 0.0075, "num_tokens": 221004428.0, "reward": 1.4562889337539673, "reward_std": 0.08340126276016235, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4681936800479889, "rewards/correct_reward_func/std": 0.1466640830039978, "step": 1696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2554.0, "completions/max_terminated_length": 2554.0, "completions/mean_length": 1524.5595703125, "completions/mean_terminated_length": 1524.5595703125, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 2.6433021806853585, "grad_norm": 0.6289503574371338, "kl": 0.053342305123806, "learning_rate": 9.462499999999999e-07, "loss": 0.0037, "num_tokens": 221138605.0, "reward": 1.4486716985702515, "reward_std": 0.09186387062072754, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4605763256549835, "rewards/correct_reward_func/std": 0.11210845410823822, "step": 1697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5169.0, "completions/max_terminated_length": 5169.0, "completions/mean_length": 1572.916748046875, "completions/mean_terminated_length": 1572.916748046875, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 2.6448598130841123, "grad_norm": 0.57989501953125, "kl": 0.054382117465138435, "learning_rate": 9.45625e-07, "loss": -0.0002, "num_tokens": 221276778.0, "reward": 1.4662318229675293, "reward_std": 0.09466242790222168, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.49004119634628296, "rewards/correct_reward_func/std": 0.1867797076702118, "step": 1698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2587.0, "completions/max_terminated_length": 2587.0, "completions/mean_length": 1533.4881591796875, "completions/mean_terminated_length": 1533.4881591796875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 2.646417445482866, "grad_norm": 0.5892370343208313, "kl": 0.0529037956148386, "learning_rate": 9.45e-07, "loss": 0.0026, "num_tokens": 221411717.0, "reward": 1.517157793045044, "reward_std": 0.05258752033114433, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.517157793045044, "rewards/correct_reward_func/std": 0.08600491285324097, "step": 1699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3820.0, "completions/max_terminated_length": 3820.0, "completions/mean_length": 1488.547607421875, "completions/mean_terminated_length": 1488.547607421875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 2.64797507788162, "grad_norm": 0.5920478701591492, "kl": 0.05196198262274265, "learning_rate": 9.44375e-07, "loss": 0.0733, "num_tokens": 221542677.0, "reward": 1.5289138555526733, "reward_std": 0.07231435924768448, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5408185720443726, "rewards/correct_reward_func/std": 0.1479278802871704, "step": 1700 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 1448.1905517578125, "completions/mean_terminated_length": 1448.1905517578125, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 2.649532710280374, "grad_norm": 0.5726686120033264, "kl": 0.05282696709036827, "learning_rate": 9.4375e-07, "loss": 0.0043, "num_tokens": 221670247.0, "reward": 1.4904793500900269, "reward_std": 0.07221858203411102, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5023840665817261, "rewards/correct_reward_func/std": 0.09976700693368912, "step": 1701 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 1474.7738037109375, "completions/mean_terminated_length": 1474.7738037109375, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "epoch": 2.6510903426791277, "grad_norm": 0.619105339050293, "kl": 0.054888103157281876, "learning_rate": 9.43125e-07, "loss": 0.0149, "num_tokens": 221799906.0, "reward": 1.482273817062378, "reward_std": 0.07818785309791565, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4941785931587219, "rewards/correct_reward_func/std": 0.12911812961101532, "step": 1702 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1433.0357666015625, "completions/mean_terminated_length": 1433.0357666015625, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 2.6526479750778815, "grad_norm": 0.6403220891952515, "kl": 0.05319024249911308, "learning_rate": 9.425e-07, "loss": 0.0032, "num_tokens": 221926167.0, "reward": 1.4578644037246704, "reward_std": 0.08385072648525238, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46976912021636963, "rewards/correct_reward_func/std": 0.14390592277050018, "step": 1703 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3067.0, "completions/mean_length": 1520.1309814453125, "completions/mean_terminated_length": 1439.7469482421875, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "epoch": 2.6542056074766354, "grad_norm": 0.5822752714157104, "kl": 0.05457857996225357, "learning_rate": 9.41875e-07, "loss": 0.0309, "num_tokens": 222059660.0, "reward": 1.511279582977295, "reward_std": 0.09577340632677078, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5231844186782837, "rewards/correct_reward_func/std": 0.15531422197818756, "step": 1704 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1473.8214111328125, "completions/mean_terminated_length": 1473.8214111328125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 2.655763239875389, "grad_norm": 0.6206010580062866, "kl": 0.053228624165058136, "learning_rate": 9.4125e-07, "loss": 0.0109, "num_tokens": 222189407.0, "reward": 1.4744253158569336, "reward_std": 0.05794044956564903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47442516684532166, "rewards/correct_reward_func/std": 0.2061844766139984, "step": 1705 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1446.047607421875, "completions/mean_terminated_length": 1446.047607421875, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "epoch": 2.657320872274143, "grad_norm": 0.557413637638092, "kl": 0.05473394691944122, "learning_rate": 9.40625e-07, "loss": 0.0018, "num_tokens": 222316887.0, "reward": 1.4996709823608398, "reward_std": 0.10409435629844666, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5234804749488831, "rewards/correct_reward_func/std": 0.12513446807861328, "step": 1706 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2450.0, "completions/max_terminated_length": 2450.0, "completions/mean_length": 1426.90478515625, "completions/mean_terminated_length": 1426.90478515625, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 2.6588785046728973, "grad_norm": 0.6628433465957642, "kl": 0.05510305427014828, "learning_rate": 9.399999999999999e-07, "loss": 0.0216, "num_tokens": 222442681.0, "reward": 1.5184814929962158, "reward_std": 0.08003492653369904, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5303862690925598, "rewards/correct_reward_func/std": 0.15375916659832, "step": 1707 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3987.0, "completions/max_terminated_length": 3987.0, "completions/mean_length": 1480.9285888671875, "completions/mean_terminated_length": 1480.9285888671875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 2.660436137071651, "grad_norm": 0.6406999230384827, "kl": 0.05271622724831104, "learning_rate": 9.393749999999999e-07, "loss": 0.0022, "num_tokens": 222572989.0, "reward": 1.5534709692001343, "reward_std": 0.0656130462884903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5534709692001343, "rewards/correct_reward_func/std": 0.1412557065486908, "step": 1708 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 1455.9285888671875, "completions/mean_terminated_length": 1455.9285888671875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "epoch": 2.661993769470405, "grad_norm": 0.5892342925071716, "kl": 0.051750196143984795, "learning_rate": 9.387499999999999e-07, "loss": 0.0098, "num_tokens": 222701359.0, "reward": 1.5207828283309937, "reward_std": 0.04332602769136429, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5207826495170593, "rewards/correct_reward_func/std": 0.15455710887908936, "step": 1709 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2332.0, "completions/max_terminated_length": 2332.0, "completions/mean_length": 1452.857177734375, "completions/mean_terminated_length": 1452.857177734375, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 2.663551401869159, "grad_norm": 0.6459684371948242, "kl": 0.05080268904566765, "learning_rate": 9.381249999999999e-07, "loss": -0.0049, "num_tokens": 222829339.0, "reward": 1.4951261281967163, "reward_std": 0.04435892030596733, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4951260983943939, "rewards/correct_reward_func/std": 0.1211523562669754, "step": 1710 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2533.0, "completions/max_terminated_length": 2533.0, "completions/mean_length": 1458.297607421875, "completions/mean_terminated_length": 1458.297607421875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 2.6651090342679127, "grad_norm": 0.5821453928947449, "kl": 0.05080693028867245, "learning_rate": 9.374999999999999e-07, "loss": 0.0174, "num_tokens": 222957992.0, "reward": 1.5405478477478027, "reward_std": 0.05841045826673508, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5405478477478027, "rewards/correct_reward_func/std": 0.1019287109375, "step": 1711 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2312.0, "completions/max_terminated_length": 2312.0, "completions/mean_length": 1452.84521484375, "completions/mean_terminated_length": 1452.84521484375, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 2.6666666666666665, "grad_norm": 0.5787824988365173, "kl": 0.05162036046385765, "learning_rate": 9.368749999999999e-07, "loss": -0.0168, "num_tokens": 223085899.0, "reward": 1.5507090091705322, "reward_std": 0.08447545766830444, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.562613844871521, "rewards/correct_reward_func/std": 0.16163305938243866, "step": 1712 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 1484.8214111328125, "completions/mean_terminated_length": 1484.8214111328125, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 2.668224299065421, "grad_norm": 0.605267345905304, "kl": 0.05258346535265446, "learning_rate": 9.3625e-07, "loss": 0.0042, "num_tokens": 223216732.0, "reward": 1.4944117069244385, "reward_std": 0.07867419719696045, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49441155791282654, "rewards/correct_reward_func/std": 0.12444284558296204, "step": 1713 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 1528.4761962890625, "completions/mean_terminated_length": 1528.4761962890625, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 2.6697819314641746, "grad_norm": 0.5905482769012451, "kl": 0.051737403497099876, "learning_rate": 9.35625e-07, "loss": 0.0238, "num_tokens": 223351064.0, "reward": 1.4821186065673828, "reward_std": 0.10998782515525818, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5059280395507812, "rewards/correct_reward_func/std": 0.1586100310087204, "step": 1714 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1518.1190185546875, "completions/mean_terminated_length": 1518.1190185546875, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "epoch": 2.6713395638629285, "grad_norm": 0.5762683749198914, "kl": 0.04895060881972313, "learning_rate": 9.35e-07, "loss": 0.0024, "num_tokens": 223484730.0, "reward": 1.4979010820388794, "reward_std": 0.10670863091945648, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.533615231513977, "rewards/correct_reward_func/std": 0.11758259683847427, "step": 1715 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1430.5357666015625, "completions/mean_terminated_length": 1430.5357666015625, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.6728971962616823, "grad_norm": 0.5974053740501404, "kl": 0.052313679829239845, "learning_rate": 9.34375e-07, "loss": -0.0182, "num_tokens": 223610775.0, "reward": 1.5079729557037354, "reward_std": 0.05624920129776001, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5079728960990906, "rewards/correct_reward_func/std": 0.19079400599002838, "step": 1716 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2091.0, "completions/max_terminated_length": 2091.0, "completions/mean_length": 1414.4405517578125, "completions/mean_terminated_length": 1414.4405517578125, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 2.674454828660436, "grad_norm": 0.6134198904037476, "kl": 0.05414976924657822, "learning_rate": 9.3375e-07, "loss": 0.0174, "num_tokens": 223735462.0, "reward": 1.542980432510376, "reward_std": 0.05245569720864296, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.542980432510376, "rewards/correct_reward_func/std": 0.18637017905712128, "step": 1717 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1447.3809814453125, "completions/mean_terminated_length": 1447.3809814453125, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 2.67601246105919, "grad_norm": 0.6158204674720764, "kl": 0.05258576385676861, "learning_rate": 9.33125e-07, "loss": -0.0129, "num_tokens": 223863000.0, "reward": 1.5099631547927856, "reward_std": 0.055979739874601364, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5099629759788513, "rewards/correct_reward_func/std": 0.15569455921649933, "step": 1718 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2591.0, "completions/max_terminated_length": 2591.0, "completions/mean_length": 1441.3333740234375, "completions/mean_terminated_length": 1441.3333740234375, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 2.677570093457944, "grad_norm": 0.572973370552063, "kl": 0.04859156347811222, "learning_rate": 9.325e-07, "loss": 0.0284, "num_tokens": 223989844.0, "reward": 1.5439099073410034, "reward_std": 0.046088360249996185, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5439099073410034, "rewards/correct_reward_func/std": 0.15077151358127594, "step": 1719 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 1415.46435546875, "completions/mean_terminated_length": 1415.46435546875, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "epoch": 2.6791277258566977, "grad_norm": 0.6280604600906372, "kl": 0.0512816458940506, "learning_rate": 9.31875e-07, "loss": 0.0197, "num_tokens": 224114527.0, "reward": 1.514798879623413, "reward_std": 0.07288581132888794, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5147988200187683, "rewards/correct_reward_func/std": 0.15633149445056915, "step": 1720 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2456.0, "completions/max_terminated_length": 2456.0, "completions/mean_length": 1413.4405517578125, "completions/mean_terminated_length": 1413.4405517578125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 2.6806853582554515, "grad_norm": 0.6026427149772644, "kl": 0.050905462354421616, "learning_rate": 9.3125e-07, "loss": 0.0004, "num_tokens": 224239190.0, "reward": 1.516261100769043, "reward_std": 0.0839489996433258, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5162610411643982, "rewards/correct_reward_func/std": 0.16772376000881195, "step": 1721 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1474.0833740234375, "completions/mean_terminated_length": 1474.0833740234375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 2.6822429906542054, "grad_norm": 0.5876814723014832, "kl": 0.050282981246709824, "learning_rate": 9.30625e-07, "loss": -0.0068, "num_tokens": 224368971.0, "reward": 1.5279390811920166, "reward_std": 0.03545699268579483, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5279390215873718, "rewards/correct_reward_func/std": 0.17711450159549713, "step": 1722 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2045.0, "completions/max_terminated_length": 2045.0, "completions/mean_length": 1448.25, "completions/mean_terminated_length": 1448.25, "completions/min_length": 952.0, "completions/min_terminated_length": 952.0, "epoch": 2.6838006230529596, "grad_norm": 0.5876829624176025, "kl": 0.04809461534023285, "learning_rate": 9.3e-07, "loss": -0.0171, "num_tokens": 224496882.0, "reward": 1.4794663190841675, "reward_std": 0.08256781846284866, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4913709759712219, "rewards/correct_reward_func/std": 0.14050987362861633, "step": 1723 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2243.0, "completions/max_terminated_length": 2243.0, "completions/mean_length": 1461.011962890625, "completions/mean_terminated_length": 1461.011962890625, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 2.6853582554517135, "grad_norm": 0.5967435836791992, "kl": 0.049721673130989075, "learning_rate": 9.293749999999999e-07, "loss": 0.0185, "num_tokens": 224625697.0, "reward": 1.5881257057189941, "reward_std": 0.06782446801662445, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5881257057189941, "rewards/correct_reward_func/std": 0.2112734615802765, "step": 1724 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 1448.8809814453125, "completions/mean_terminated_length": 1448.8809814453125, "completions/min_length": 556.0, "completions/min_terminated_length": 556.0, "epoch": 2.6869158878504673, "grad_norm": 0.6011800169944763, "kl": 0.05298084765672684, "learning_rate": 9.287499999999999e-07, "loss": 0.0173, "num_tokens": 224753409.0, "reward": 1.5033811330795288, "reward_std": 0.0700373575091362, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5152859091758728, "rewards/correct_reward_func/std": 0.138230562210083, "step": 1725 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2743.0, "completions/max_terminated_length": 2743.0, "completions/mean_length": 1493.9881591796875, "completions/mean_terminated_length": 1493.9881591796875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 2.688473520249221, "grad_norm": 0.5747527480125427, "kl": 0.050631532445549965, "learning_rate": 9.281249999999999e-07, "loss": -0.0028, "num_tokens": 224884826.0, "reward": 1.5200514793395996, "reward_std": 0.07257713377475739, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5319561958312988, "rewards/correct_reward_func/std": 0.16938234865665436, "step": 1726 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2259.0, "completions/max_terminated_length": 2259.0, "completions/mean_length": 1511.96435546875, "completions/mean_terminated_length": 1511.96435546875, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 2.690031152647975, "grad_norm": 0.5667654871940613, "kl": 0.0497352909296751, "learning_rate": 9.274999999999999e-07, "loss": 0.0098, "num_tokens": 225017849.0, "reward": 1.4885200262069702, "reward_std": 0.050371404737234116, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48851993680000305, "rewards/correct_reward_func/std": 0.10705308616161346, "step": 1727 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1504.011962890625, "completions/mean_terminated_length": 1504.011962890625, "completions/min_length": 1040.0, "completions/min_terminated_length": 1040.0, "epoch": 2.691588785046729, "grad_norm": 0.5516419410705566, "kl": 0.0479924101382494, "learning_rate": 9.268749999999999e-07, "loss": -0.0095, "num_tokens": 225150180.0, "reward": 1.5543620586395264, "reward_std": 0.051653556525707245, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5543619990348816, "rewards/correct_reward_func/std": 0.1859487146139145, "step": 1728 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1506.916748046875, "completions/mean_terminated_length": 1426.3734130859375, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 2.693146417445483, "grad_norm": 0.5902113914489746, "kl": 0.0502075869590044, "learning_rate": 9.2625e-07, "loss": 0.0834, "num_tokens": 225282557.0, "reward": 1.5599299669265747, "reward_std": 0.08853092044591904, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5599297285079956, "rewards/correct_reward_func/std": 0.15377002954483032, "step": 1729 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1534.5357666015625, "completions/mean_terminated_length": 1534.5357666015625, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 2.694704049844237, "grad_norm": 0.5623704195022583, "kl": 0.04959819093346596, "learning_rate": 9.25625e-07, "loss": -0.0476, "num_tokens": 225417434.0, "reward": 1.5298880338668823, "reward_std": 0.04353261739015579, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5298879742622375, "rewards/correct_reward_func/std": 0.14602777361869812, "step": 1730 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 1474.9405517578125, "completions/mean_terminated_length": 1474.9405517578125, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 2.696261682242991, "grad_norm": 0.5516695380210876, "kl": 0.050167109817266464, "learning_rate": 9.25e-07, "loss": -0.0214, "num_tokens": 225547401.0, "reward": 1.4920963048934937, "reward_std": 0.03700582683086395, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49209627509117126, "rewards/correct_reward_func/std": 0.14301958680152893, "step": 1731 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2820.0, "completions/max_terminated_length": 2820.0, "completions/mean_length": 1543.2261962890625, "completions/mean_terminated_length": 1543.2261962890625, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 2.6978193146417446, "grad_norm": 0.5902801752090454, "kl": 0.04959946498274803, "learning_rate": 9.243749999999999e-07, "loss": 0.0475, "num_tokens": 225683200.0, "reward": 1.5292655229568481, "reward_std": 0.049844931811094284, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5292653441429138, "rewards/correct_reward_func/std": 0.1660182923078537, "step": 1732 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2298.0, "completions/max_terminated_length": 2298.0, "completions/mean_length": 1530.8095703125, "completions/mean_terminated_length": 1530.8095703125, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "epoch": 2.6993769470404985, "grad_norm": 0.5620558261871338, "kl": 0.050421273335814476, "learning_rate": 9.237499999999999e-07, "loss": 0.0088, "num_tokens": 225817566.0, "reward": 1.5280489921569824, "reward_std": 0.05596272274851799, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5280489325523376, "rewards/correct_reward_func/std": 0.18441252410411835, "step": 1733 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 1490.6429443359375, "completions/mean_terminated_length": 1490.6429443359375, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "epoch": 2.7009345794392523, "grad_norm": 0.5768707394599915, "kl": 0.04820503666996956, "learning_rate": 9.23125e-07, "loss": 0.0152, "num_tokens": 225948822.0, "reward": 1.5188841819763184, "reward_std": 0.07099802047014236, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5307888984680176, "rewards/correct_reward_func/std": 0.1781437247991562, "step": 1734 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 1534.1429443359375, "completions/mean_terminated_length": 1534.1429443359375, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 2.702492211838006, "grad_norm": 0.5477608442306519, "kl": 0.04731915518641472, "learning_rate": 9.225e-07, "loss": 0.0014, "num_tokens": 226083738.0, "reward": 1.5319446325302124, "reward_std": 0.07295802980661392, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5438492894172668, "rewards/correct_reward_func/std": 0.1826002597808838, "step": 1735 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1490.107177734375, "completions/mean_terminated_length": 1490.107177734375, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 2.70404984423676, "grad_norm": 0.5740139484405518, "kl": 0.047905333340168, "learning_rate": 9.21875e-07, "loss": -0.0232, "num_tokens": 226214877.0, "reward": 1.5207157135009766, "reward_std": 0.06020629033446312, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5207157135009766, "rewards/correct_reward_func/std": 0.139086052775383, "step": 1736 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 1454.261962890625, "completions/mean_terminated_length": 1454.261962890625, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 2.705607476635514, "grad_norm": 0.6038780808448792, "kl": 0.04983044043183327, "learning_rate": 9.2125e-07, "loss": -0.011, "num_tokens": 226343005.0, "reward": 1.5793057680130005, "reward_std": 0.06842636317014694, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5793058276176453, "rewards/correct_reward_func/std": 0.17215418815612793, "step": 1737 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2178.0, "completions/max_terminated_length": 2178.0, "completions/mean_length": 1417.6905517578125, "completions/mean_terminated_length": 1417.6905517578125, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "epoch": 2.7071651090342677, "grad_norm": 0.625968337059021, "kl": 0.05162692815065384, "learning_rate": 9.20625e-07, "loss": 0.0, "num_tokens": 226467839.0, "reward": 1.4611461162567139, "reward_std": 0.06528954952955246, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4611458480358124, "rewards/correct_reward_func/std": 0.1269480139017105, "step": 1738 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1509.107177734375, "completions/mean_terminated_length": 1509.107177734375, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.708722741433022, "grad_norm": 0.5869940519332886, "kl": 0.047593979164958, "learning_rate": 9.2e-07, "loss": 0.0065, "num_tokens": 226600622.0, "reward": 1.5243686437606812, "reward_std": 0.050143033266067505, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5243686437606812, "rewards/correct_reward_func/std": 0.18004749715328217, "step": 1739 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 1452.0, "completions/mean_terminated_length": 1452.0, "completions/min_length": 1038.0, "completions/min_terminated_length": 1038.0, "epoch": 2.710280373831776, "grad_norm": 0.5852388739585876, "kl": 0.05029440112411976, "learning_rate": 9.19375e-07, "loss": -0.0053, "num_tokens": 226728386.0, "reward": 1.5157904624938965, "reward_std": 0.07082771509885788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5157904624938965, "rewards/correct_reward_func/std": 0.12376675754785538, "step": 1740 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2676.0, "completions/max_terminated_length": 2676.0, "completions/mean_length": 1455.511962890625, "completions/mean_terminated_length": 1455.511962890625, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 2.7118380062305296, "grad_norm": 0.615674614906311, "kl": 0.05315720476210117, "learning_rate": 9.187499999999999e-07, "loss": 0.0012, "num_tokens": 226856547.0, "reward": 1.4845690727233887, "reward_std": 0.09542543441057205, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4964737594127655, "rewards/correct_reward_func/std": 0.1251288503408432, "step": 1741 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2285.0, "completions/max_terminated_length": 2285.0, "completions/mean_length": 1564.357177734375, "completions/mean_terminated_length": 1564.357177734375, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 2.7133956386292835, "grad_norm": 0.5546826720237732, "kl": 0.04791397042572498, "learning_rate": 9.181249999999999e-07, "loss": -0.0168, "num_tokens": 226994031.0, "reward": 1.5511893033981323, "reward_std": 0.057545408606529236, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5511892437934875, "rewards/correct_reward_func/std": 0.1881408989429474, "step": 1742 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2649.0, "completions/max_terminated_length": 2649.0, "completions/mean_length": 1513.416748046875, "completions/mean_terminated_length": 1513.416748046875, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 2.7149532710280373, "grad_norm": 0.5818808078765869, "kl": 0.04986939579248428, "learning_rate": 9.174999999999999e-07, "loss": -0.0073, "num_tokens": 227127194.0, "reward": 1.5222913026809692, "reward_std": 0.09807135164737701, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.534196138381958, "rewards/correct_reward_func/std": 0.178168386220932, "step": 1743 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3181.0, "completions/mean_length": 1639.25, "completions/mean_terminated_length": 1560.3011474609375, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 2.716510903426791, "grad_norm": 0.5936155319213867, "kl": 0.047423213720321655, "learning_rate": 9.168749999999999e-07, "loss": 0.0511, "num_tokens": 227270957.0, "reward": 1.4892656803131104, "reward_std": 0.06924289464950562, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4892656207084656, "rewards/correct_reward_func/std": 0.2082015722990036, "step": 1744 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1524.2857666015625, "completions/mean_terminated_length": 1524.2857666015625, "completions/min_length": 1047.0, "completions/min_terminated_length": 1047.0, "epoch": 2.7180685358255454, "grad_norm": 0.5865187048912048, "kl": 0.049725282937288284, "learning_rate": 9.1625e-07, "loss": -0.002, "num_tokens": 227405021.0, "reward": 1.46793794631958, "reward_std": 0.05386343598365784, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4679379165172577, "rewards/correct_reward_func/std": 0.13641268014907837, "step": 1745 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 1527.107177734375, "completions/mean_terminated_length": 1527.107177734375, "completions/min_length": 1030.0, "completions/min_terminated_length": 1030.0, "epoch": 2.7196261682242993, "grad_norm": 0.5725459456443787, "kl": 0.05072592943906784, "learning_rate": 9.15625e-07, "loss": -0.0133, "num_tokens": 227539352.0, "reward": 1.5217900276184082, "reward_std": 0.044926997274160385, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5217899680137634, "rewards/correct_reward_func/std": 0.14724591374397278, "step": 1746 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 1613.857177734375, "completions/mean_terminated_length": 1534.602294921875, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 2.721183800623053, "grad_norm": 0.5490879416465759, "kl": 0.04507457837462425, "learning_rate": 9.15e-07, "loss": 0.0343, "num_tokens": 227680928.0, "reward": 1.3781310319900513, "reward_std": 0.06081504002213478, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3900357782840729, "rewards/correct_reward_func/std": 0.15293066203594208, "step": 1747 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1463.952392578125, "completions/mean_terminated_length": 1463.952392578125, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 2.722741433021807, "grad_norm": 0.5768812298774719, "kl": 0.0485483855009079, "learning_rate": 9.14375e-07, "loss": 0.0104, "num_tokens": 227809966.0, "reward": 1.5592997074127197, "reward_std": 0.07631435990333557, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5712043046951294, "rewards/correct_reward_func/std": 0.1735580861568451, "step": 1748 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3407.0, "completions/max_terminated_length": 3407.0, "completions/mean_length": 1658.34521484375, "completions/mean_terminated_length": 1658.34521484375, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 2.7242990654205608, "grad_norm": 0.5461453795433044, "kl": 0.04537799954414368, "learning_rate": 9.137499999999999e-07, "loss": 0.034, "num_tokens": 227955399.0, "reward": 1.5284554958343506, "reward_std": 0.05018966645002365, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5284554362297058, "rewards/correct_reward_func/std": 0.14444339275360107, "step": 1749 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2797.0, "completions/max_terminated_length": 2797.0, "completions/mean_length": 1593.65478515625, "completions/mean_terminated_length": 1593.65478515625, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 2.7258566978193146, "grad_norm": 0.5811620354652405, "kl": 0.04931554198265076, "learning_rate": 9.131249999999999e-07, "loss": 0.0091, "num_tokens": 228095182.0, "reward": 1.421472430229187, "reward_std": 0.08760947734117508, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43337711691856384, "rewards/correct_reward_func/std": 0.12807448208332062, "step": 1750 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2798.0, "completions/mean_length": 1613.3929443359375, "completions/mean_terminated_length": 1534.1324462890625, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 2.7274143302180685, "grad_norm": 0.5985891222953796, "kl": 0.04628431983292103, "learning_rate": 9.124999999999999e-07, "loss": 0.056, "num_tokens": 228236689.0, "reward": 1.501530408859253, "reward_std": 0.11326522380113602, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5253397822380066, "rewards/correct_reward_func/std": 0.1827133297920227, "step": 1751 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2355.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 1465.9285888671875, "completions/mean_terminated_length": 1465.9285888671875, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "epoch": 2.7289719626168223, "grad_norm": 0.6031157970428467, "kl": 0.04876247979700565, "learning_rate": 9.11875e-07, "loss": -0.0206, "num_tokens": 228365749.0, "reward": 1.5216147899627686, "reward_std": 0.0717300996184349, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5216147303581238, "rewards/correct_reward_func/std": 0.15156029164791107, "step": 1752 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 1509.6309814453125, "completions/mean_terminated_length": 1509.6309814453125, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "epoch": 2.730529595015576, "grad_norm": 0.5991784930229187, "kl": 0.0491492785513401, "learning_rate": 9.1125e-07, "loss": 0.0326, "num_tokens": 228498684.0, "reward": 1.522566318511963, "reward_std": 0.0603664368391037, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5225663185119629, "rewards/correct_reward_func/std": 0.15381312370300293, "step": 1753 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1527.952392578125, "completions/mean_terminated_length": 1527.952392578125, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 2.73208722741433, "grad_norm": 0.542353093624115, "kl": 0.04700817912817001, "learning_rate": 9.10625e-07, "loss": 0.0052, "num_tokens": 228633092.0, "reward": 1.5241053104400635, "reward_std": 0.08161614090204239, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5360099673271179, "rewards/correct_reward_func/std": 0.15590722858905792, "step": 1754 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1453.107177734375, "completions/mean_terminated_length": 1453.107177734375, "completions/min_length": 553.0, "completions/min_terminated_length": 553.0, "epoch": 2.7336448598130842, "grad_norm": 0.5914690494537354, "kl": 0.04987042024731636, "learning_rate": 9.1e-07, "loss": -0.0014, "num_tokens": 228760991.0, "reward": 1.4725608825683594, "reward_std": 0.04902815818786621, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47256073355674744, "rewards/correct_reward_func/std": 0.17008233070373535, "step": 1755 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2563.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 1640.59521484375, "completions/mean_terminated_length": 1640.59521484375, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "epoch": 2.735202492211838, "grad_norm": 0.5368270874023438, "kl": 0.04745759256184101, "learning_rate": 9.09375e-07, "loss": 0.014, "num_tokens": 228904993.0, "reward": 1.5165960788726807, "reward_std": 0.0806044414639473, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5285007357597351, "rewards/correct_reward_func/std": 0.13134127855300903, "step": 1756 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 1498.297607421875, "completions/mean_terminated_length": 1498.297607421875, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 2.736760124610592, "grad_norm": 0.5595915913581848, "kl": 0.04576588608324528, "learning_rate": 9.087499999999999e-07, "loss": -0.0156, "num_tokens": 229036886.0, "reward": 1.4800204038619995, "reward_std": 0.07226181030273438, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49192509055137634, "rewards/correct_reward_func/std": 0.12641641497612, "step": 1757 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 1478.5357666015625, "completions/mean_terminated_length": 1478.5357666015625, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "epoch": 2.7383177570093458, "grad_norm": 0.6056258678436279, "kl": 0.04847019352018833, "learning_rate": 9.081249999999999e-07, "loss": 0.0055, "num_tokens": 229166933.0, "reward": 1.5236999988555908, "reward_std": 0.06552586704492569, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5237000584602356, "rewards/correct_reward_func/std": 0.1572173535823822, "step": 1758 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 1607.3929443359375, "completions/mean_terminated_length": 1528.0601806640625, "completions/min_length": 1121.0, "completions/min_terminated_length": 1121.0, "epoch": 2.7398753894080996, "grad_norm": 0.5343107581138611, "kl": 0.04823216423392296, "learning_rate": 9.074999999999999e-07, "loss": 0.0852, "num_tokens": 229307906.0, "reward": 1.4793072938919067, "reward_std": 0.06201612576842308, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47930726408958435, "rewards/correct_reward_func/std": 0.14715227484703064, "step": 1759 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1554.6905517578125, "completions/mean_terminated_length": 1554.6905517578125, "completions/min_length": 1014.0, "completions/min_terminated_length": 1014.0, "epoch": 2.7414330218068534, "grad_norm": 0.5738732218742371, "kl": 0.047535086050629616, "learning_rate": 9.068749999999999e-07, "loss": -0.0099, "num_tokens": 229444440.0, "reward": 1.517982006072998, "reward_std": 0.05788485333323479, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.517982006072998, "rewards/correct_reward_func/std": 0.10987600684165955, "step": 1760 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2432.0, "completions/max_terminated_length": 2432.0, "completions/mean_length": 1525.797607421875, "completions/mean_terminated_length": 1525.797607421875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 2.7429906542056077, "grad_norm": 0.5735669732093811, "kl": 0.049819137901067734, "learning_rate": 9.0625e-07, "loss": -0.0094, "num_tokens": 229578679.0, "reward": 1.46602201461792, "reward_std": 0.04789911210536957, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4660220444202423, "rewards/correct_reward_func/std": 0.1111576184630394, "step": 1761 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2481.0, "completions/max_terminated_length": 2481.0, "completions/mean_length": 1577.1190185546875, "completions/mean_terminated_length": 1577.1190185546875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 2.7445482866043616, "grad_norm": 0.5863901972770691, "kl": 0.04598477482795715, "learning_rate": 9.05625e-07, "loss": -0.0033, "num_tokens": 229717217.0, "reward": 1.5324137210845947, "reward_std": 0.06905942410230637, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5324137806892395, "rewards/correct_reward_func/std": 0.13709920644760132, "step": 1762 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2640.0, "completions/max_terminated_length": 2640.0, "completions/mean_length": 1530.702392578125, "completions/mean_terminated_length": 1530.702392578125, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 2.7461059190031154, "grad_norm": 0.6212528944015503, "kl": 0.049285657703876495, "learning_rate": 9.05e-07, "loss": 0.0175, "num_tokens": 229851784.0, "reward": 1.4829535484313965, "reward_std": 0.04065670445561409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4829535186290741, "rewards/correct_reward_func/std": 0.13543906807899475, "step": 1763 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 1490.5714111328125, "completions/mean_terminated_length": 1490.5714111328125, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "epoch": 2.7476635514018692, "grad_norm": 0.5610176920890808, "kl": 0.04711009934544563, "learning_rate": 9.04375e-07, "loss": -0.0015, "num_tokens": 229982968.0, "reward": 1.53604257106781, "reward_std": 0.0722825676202774, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5360425114631653, "rewards/correct_reward_func/std": 0.1605040282011032, "step": 1764 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2158.0, "completions/max_terminated_length": 2158.0, "completions/mean_length": 1526.702392578125, "completions/mean_terminated_length": 1526.702392578125, "completions/min_length": 1077.0, "completions/min_terminated_length": 1077.0, "epoch": 2.749221183800623, "grad_norm": 0.5382264852523804, "kl": 0.04644685983657837, "learning_rate": 9.0375e-07, "loss": 0.0123, "num_tokens": 230117247.0, "reward": 1.5044747591018677, "reward_std": 0.0811743438243866, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5163795351982117, "rewards/correct_reward_func/std": 0.17006200551986694, "step": 1765 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1500.6309814453125, "completions/mean_terminated_length": 1500.6309814453125, "completions/min_length": 1034.0, "completions/min_terminated_length": 1034.0, "epoch": 2.750778816199377, "grad_norm": 0.5806080102920532, "kl": 0.04652276076376438, "learning_rate": 9.031249999999999e-07, "loss": 0.0131, "num_tokens": 230249330.0, "reward": 1.5374468564987183, "reward_std": 0.09140986949205399, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.549351692199707, "rewards/correct_reward_func/std": 0.1589287370443344, "step": 1766 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 1531.7738037109375, "completions/mean_terminated_length": 1531.7738037109375, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 2.7523364485981308, "grad_norm": 0.5608317852020264, "kl": 0.046816227957606316, "learning_rate": 9.024999999999999e-07, "loss": 0.0567, "num_tokens": 230383975.0, "reward": 1.5090358257293701, "reward_std": 0.08409078419208527, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5090356469154358, "rewards/correct_reward_func/std": 0.16797088086605072, "step": 1767 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2512.0, "completions/mean_length": 1600.8929443359375, "completions/mean_terminated_length": 1521.4818115234375, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "epoch": 2.7538940809968846, "grad_norm": 0.5454899072647095, "kl": 0.04522669315338135, "learning_rate": 9.018749999999999e-07, "loss": 0.0393, "num_tokens": 230524546.0, "reward": 1.5638409852981567, "reward_std": 0.06667964160442352, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5638408660888672, "rewards/correct_reward_func/std": 0.17660924792289734, "step": 1768 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2136.0, "completions/max_terminated_length": 2136.0, "completions/mean_length": 1462.5238037109375, "completions/mean_terminated_length": 1462.5238037109375, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 2.7554517133956384, "grad_norm": 0.5854402184486389, "kl": 0.04746519774198532, "learning_rate": 9.0125e-07, "loss": -0.0129, "num_tokens": 230653398.0, "reward": 1.5523654222488403, "reward_std": 0.053342536091804504, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5523654222488403, "rewards/correct_reward_func/std": 0.1512749046087265, "step": 1769 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 1487.4881591796875, "completions/mean_terminated_length": 1487.4881591796875, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 2.7570093457943923, "grad_norm": 0.601487934589386, "kl": 0.04963444173336029, "learning_rate": 9.00625e-07, "loss": -0.0045, "num_tokens": 230784311.0, "reward": 1.4925949573516846, "reward_std": 0.06394613534212112, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49259480834007263, "rewards/correct_reward_func/std": 0.12653028964996338, "step": 1770 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 1460.797607421875, "completions/mean_terminated_length": 1460.797607421875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 2.7585669781931466, "grad_norm": 0.595573902130127, "kl": 0.0474627111107111, "learning_rate": 9e-07, "loss": -0.0063, "num_tokens": 230912856.0, "reward": 1.4992130994796753, "reward_std": 0.05356660857796669, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4992130398750305, "rewards/correct_reward_func/std": 0.16943812370300293, "step": 1771 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2258.0, "completions/max_terminated_length": 2258.0, "completions/mean_length": 1469.7381591796875, "completions/mean_terminated_length": 1469.7381591796875, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 2.7601246105919004, "grad_norm": 0.6372004747390747, "kl": 0.04596925899386406, "learning_rate": 8.99375e-07, "loss": 0.0135, "num_tokens": 231042200.0, "reward": 1.4725146293640137, "reward_std": 0.10624904185533524, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4844193160533905, "rewards/correct_reward_func/std": 0.1433822512626648, "step": 1772 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2031.0, "completions/max_terminated_length": 2031.0, "completions/mean_length": 1432.65478515625, "completions/mean_terminated_length": 1432.65478515625, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 2.7616822429906542, "grad_norm": 0.5866255760192871, "kl": 0.045393964275717735, "learning_rate": 8.9875e-07, "loss": -0.0106, "num_tokens": 231168333.0, "reward": 1.5332305431365967, "reward_std": 0.09277599304914474, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5451352000236511, "rewards/correct_reward_func/std": 0.1545981466770172, "step": 1773 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 1479.4405517578125, "completions/mean_terminated_length": 1479.4405517578125, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.763239875389408, "grad_norm": 0.5937835574150085, "kl": 0.047744568437337875, "learning_rate": 8.981249999999999e-07, "loss": 0.015, "num_tokens": 231298636.0, "reward": 1.480086326599121, "reward_std": 0.05732141435146332, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48008623719215393, "rewards/correct_reward_func/std": 0.1658886820077896, "step": 1774 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2017.0, "completions/max_terminated_length": 2017.0, "completions/mean_length": 1471.34521484375, "completions/mean_terminated_length": 1471.34521484375, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 2.764797507788162, "grad_norm": 0.6031578779220581, "kl": 0.047263454645872116, "learning_rate": 8.974999999999999e-07, "loss": -0.0219, "num_tokens": 231428163.0, "reward": 1.534923791885376, "reward_std": 0.08724810928106308, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.54682856798172, "rewards/correct_reward_func/std": 0.13087236881256104, "step": 1775 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2331.0, "completions/max_terminated_length": 2331.0, "completions/mean_length": 1491.7857666015625, "completions/mean_terminated_length": 1491.7857666015625, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "epoch": 2.7663551401869158, "grad_norm": 0.6025930643081665, "kl": 0.04710867255926132, "learning_rate": 8.96875e-07, "loss": -0.0163, "num_tokens": 231559521.0, "reward": 1.5351918935775757, "reward_std": 0.052090905606746674, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5351918339729309, "rewards/correct_reward_func/std": 0.1890571415424347, "step": 1776 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 1539.297607421875, "completions/mean_terminated_length": 1539.297607421875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 2.76791277258567, "grad_norm": 0.5954597592353821, "kl": 0.04720473289489746, "learning_rate": 8.9625e-07, "loss": 0.0173, "num_tokens": 231694852.0, "reward": 1.47535240650177, "reward_std": 0.05510924011468887, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4753521978855133, "rewards/correct_reward_func/std": 0.17776024341583252, "step": 1777 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 1498.8809814453125, "completions/mean_terminated_length": 1498.8809814453125, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 2.769470404984424, "grad_norm": 0.5676393508911133, "kl": 0.045732319355010986, "learning_rate": 8.95625e-07, "loss": 0.0043, "num_tokens": 231826914.0, "reward": 1.5926486253738403, "reward_std": 0.05302140861749649, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5926485657691956, "rewards/correct_reward_func/std": 0.14903192222118378, "step": 1778 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1471.1190185546875, "completions/mean_terminated_length": 1471.1190185546875, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 2.7710280373831777, "grad_norm": 0.5938274264335632, "kl": 0.048066020011901855, "learning_rate": 8.95e-07, "loss": 0.0201, "num_tokens": 231956422.0, "reward": 1.5617477893829346, "reward_std": 0.061012960970401764, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5617477893829346, "rewards/correct_reward_func/std": 0.12701815366744995, "step": 1779 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1512.8929443359375, "completions/mean_terminated_length": 1512.8929443359375, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 2.7725856697819315, "grad_norm": 0.5453293919563293, "kl": 0.04835481941699982, "learning_rate": 8.94375e-07, "loss": 0.0087, "num_tokens": 232089595.0, "reward": 1.4647778272628784, "reward_std": 0.04078866168856621, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46477770805358887, "rewards/correct_reward_func/std": 0.13149945437908173, "step": 1780 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2215.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 1454.7738037109375, "completions/mean_terminated_length": 1454.7738037109375, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 2.7741433021806854, "grad_norm": 0.5911906957626343, "kl": 0.044606078416109085, "learning_rate": 8.9375e-07, "loss": 0.0371, "num_tokens": 232217844.0, "reward": 1.6045165061950684, "reward_std": 0.06208869442343712, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.6045165657997131, "rewards/correct_reward_func/std": 0.1938626915216446, "step": 1781 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 1408.9881591796875, "completions/mean_terminated_length": 1408.9881591796875, "completions/min_length": 718.0, "completions/min_terminated_length": 718.0, "epoch": 2.7757009345794392, "grad_norm": 0.5153990387916565, "kl": 0.04651484452188015, "learning_rate": 8.931249999999999e-07, "loss": 0.0066, "num_tokens": 232342025.0, "reward": 1.4958608150482178, "reward_std": 0.05716870725154877, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4958606958389282, "rewards/correct_reward_func/std": 0.16062328219413757, "step": 1782 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1409.702392578125, "completions/mean_terminated_length": 1409.702392578125, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 2.777258566978193, "grad_norm": 0.5813563466072083, "kl": 0.05050436221063137, "learning_rate": 8.924999999999999e-07, "loss": -0.0213, "num_tokens": 232466218.0, "reward": 1.469009280204773, "reward_std": 0.04477865621447563, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46900931000709534, "rewards/correct_reward_func/std": 0.10244528204202652, "step": 1783 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2245.0, "completions/max_terminated_length": 2245.0, "completions/mean_length": 1479.357177734375, "completions/mean_terminated_length": 1479.357177734375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 2.778816199376947, "grad_norm": 0.6247825622558594, "kl": 0.047050368040800095, "learning_rate": 8.918749999999999e-07, "loss": -0.003, "num_tokens": 232596472.0, "reward": 1.5095137357711792, "reward_std": 0.04498210549354553, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5095137357711792, "rewards/correct_reward_func/std": 0.15733098983764648, "step": 1784 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1447.107177734375, "completions/mean_terminated_length": 1447.107177734375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 2.7803738317757007, "grad_norm": 0.5777700543403625, "kl": 0.04822980798780918, "learning_rate": 8.912499999999999e-07, "loss": 0.0078, "num_tokens": 232724029.0, "reward": 1.5429933071136475, "reward_std": 0.05474673584103584, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5429931879043579, "rewards/correct_reward_func/std": 0.15942759811878204, "step": 1785 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2395.0, "completions/max_terminated_length": 2395.0, "completions/mean_length": 1451.0833740234375, "completions/mean_terminated_length": 1451.0833740234375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 2.7819314641744546, "grad_norm": 0.5709784030914307, "kl": 0.048276230692863464, "learning_rate": 8.906249999999999e-07, "loss": 0.0238, "num_tokens": 232851890.0, "reward": 1.5575634241104126, "reward_std": 0.05975179374217987, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5575634241104126, "rewards/correct_reward_func/std": 0.1885243058204651, "step": 1786 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 1446.4285888671875, "completions/mean_terminated_length": 1365.1566162109375, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 2.783489096573209, "grad_norm": 0.6158217787742615, "kl": 0.04661812633275986, "learning_rate": 8.9e-07, "loss": -0.0596, "num_tokens": 232979186.0, "reward": 1.570290207862854, "reward_std": 0.07791650295257568, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5702901482582092, "rewards/correct_reward_func/std": 0.1744825392961502, "step": 1787 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1428.9285888671875, "completions/mean_terminated_length": 1428.9285888671875, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 2.7850467289719627, "grad_norm": 0.6107379794120789, "kl": 0.04876162111759186, "learning_rate": 8.89375e-07, "loss": -0.024, "num_tokens": 233105096.0, "reward": 1.440801978111267, "reward_std": 0.06258574873209, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4408018887042999, "rewards/correct_reward_func/std": 0.16484986245632172, "step": 1788 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 1500.0, "completions/mean_terminated_length": 1500.0, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "epoch": 2.7866043613707165, "grad_norm": 0.605813205242157, "kl": 0.04873121343553066, "learning_rate": 8.8875e-07, "loss": 0.0187, "num_tokens": 233237198.0, "reward": 1.5051733255386353, "reward_std": 0.06263089925050735, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5051731467247009, "rewards/correct_reward_func/std": 0.11007744818925858, "step": 1789 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1457.84521484375, "completions/mean_terminated_length": 1457.84521484375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "epoch": 2.7881619937694704, "grad_norm": 0.5985887050628662, "kl": 0.04958142526447773, "learning_rate": 8.88125e-07, "loss": 0.0239, "num_tokens": 233365585.0, "reward": 1.5312823057174683, "reward_std": 0.052604805678129196, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5312822461128235, "rewards/correct_reward_func/std": 0.16407187283039093, "step": 1790 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1402.011962890625, "completions/mean_terminated_length": 1402.011962890625, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "epoch": 2.789719626168224, "grad_norm": 0.6014444828033447, "kl": 0.05011572316288948, "learning_rate": 8.874999999999999e-07, "loss": -0.0142, "num_tokens": 233489204.0, "reward": 1.5492042303085327, "reward_std": 0.060535624623298645, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5492041707038879, "rewards/correct_reward_func/std": 0.18691186606884003, "step": 1791 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1514.6429443359375, "completions/mean_terminated_length": 1514.6429443359375, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "epoch": 2.791277258566978, "grad_norm": 0.5926446914672852, "kl": 0.04787629656493664, "learning_rate": 8.86875e-07, "loss": 0.005, "num_tokens": 233622362.0, "reward": 1.4618563652038574, "reward_std": 0.04442992061376572, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4618563950061798, "rewards/correct_reward_func/std": 0.1323709785938263, "step": 1792 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1385.25, "completions/mean_terminated_length": 1385.25, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 2.7928348909657323, "grad_norm": 0.629033625125885, "kl": 0.04952401854097843, "learning_rate": 8.8625e-07, "loss": -0.027, "num_tokens": 233744543.0, "reward": 1.4781965017318726, "reward_std": 0.08645133674144745, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4901011288166046, "rewards/correct_reward_func/std": 0.17877091467380524, "step": 1793 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 1477.0595703125, "completions/mean_terminated_length": 1477.0595703125, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 2.794392523364486, "grad_norm": 0.6132879257202148, "kl": 0.04768448695540428, "learning_rate": 8.85625e-07, "loss": 0.0372, "num_tokens": 233874682.0, "reward": 1.5762325525283813, "reward_std": 0.042931340634822845, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5762326121330261, "rewards/correct_reward_func/std": 0.19716502726078033, "step": 1794 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1508.7738037109375, "completions/mean_terminated_length": 1508.7738037109375, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 2.79595015576324, "grad_norm": 0.5930925607681274, "kl": 0.04705083183944225, "learning_rate": 8.85e-07, "loss": 0.0175, "num_tokens": 234007449.0, "reward": 1.6280020475387573, "reward_std": 0.061192672699689865, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.628001868724823, "rewards/correct_reward_func/std": 0.21653655171394348, "step": 1795 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1489.90478515625, "completions/mean_terminated_length": 1489.90478515625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 2.797507788161994, "grad_norm": 0.5763041973114014, "kl": 0.04887812025845051, "learning_rate": 8.84375e-07, "loss": 0.0034, "num_tokens": 234138487.0, "reward": 1.4926257133483887, "reward_std": 0.05854571983218193, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49262556433677673, "rewards/correct_reward_func/std": 0.1880824863910675, "step": 1796 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1437.21435546875, "completions/mean_terminated_length": 1437.21435546875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 2.7990654205607477, "grad_norm": 0.5947045087814331, "kl": 0.04747645743191242, "learning_rate": 8.8375e-07, "loss": 0.0226, "num_tokens": 234265291.0, "reward": 1.5490565299987793, "reward_std": 0.051774367690086365, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5490564703941345, "rewards/correct_reward_func/std": 0.15908972918987274, "step": 1797 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1471.5595703125, "completions/mean_terminated_length": 1471.5595703125, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 2.8006230529595015, "grad_norm": 0.6101751923561096, "kl": 0.04977880045771599, "learning_rate": 8.83125e-07, "loss": 0.0157, "num_tokens": 234394998.0, "reward": 1.4809707403182983, "reward_std": 0.07007608562707901, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48097047209739685, "rewards/correct_reward_func/std": 0.12403425574302673, "step": 1798 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1474.1309814453125, "completions/mean_terminated_length": 1474.1309814453125, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 2.8021806853582554, "grad_norm": 0.5902013778686523, "kl": 0.047758955508470535, "learning_rate": 8.824999999999999e-07, "loss": 0.0093, "num_tokens": 234524927.0, "reward": 1.5250070095062256, "reward_std": 0.09152919799089432, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.53691166639328, "rewards/correct_reward_func/std": 0.10868566483259201, "step": 1799 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2179.0, "completions/max_terminated_length": 2179.0, "completions/mean_length": 1495.6785888671875, "completions/mean_terminated_length": 1495.6785888671875, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 2.803738317757009, "grad_norm": 0.6023502349853516, "kl": 0.05121815763413906, "learning_rate": 8.818749999999999e-07, "loss": 0.0233, "num_tokens": 234656666.0, "reward": 1.5014182329177856, "reward_std": 0.08978351950645447, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5133229494094849, "rewards/correct_reward_func/std": 0.13750571012496948, "step": 1800 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1444.7381591796875, "completions/mean_terminated_length": 1444.7381591796875, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 2.805295950155763, "grad_norm": 0.5755165815353394, "kl": 0.04876223765313625, "learning_rate": 8.812499999999999e-07, "loss": 0.0285, "num_tokens": 234784048.0, "reward": 1.5259242057800293, "reward_std": 0.06941098719835281, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.525924026966095, "rewards/correct_reward_func/std": 0.15157389640808105, "step": 1801 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 1518.607177734375, "completions/mean_terminated_length": 1518.607177734375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 2.806853582554517, "grad_norm": 0.5908817648887634, "kl": 0.04763074219226837, "learning_rate": 8.806249999999999e-07, "loss": 0.0203, "num_tokens": 234918007.0, "reward": 1.5190402269363403, "reward_std": 0.0945274755358696, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.53094482421875, "rewards/correct_reward_func/std": 0.18075872957706451, "step": 1802 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1465.797607421875, "completions/mean_terminated_length": 1465.797607421875, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 2.808411214953271, "grad_norm": 0.569840133190155, "kl": 0.047889675945043564, "learning_rate": 8.799999999999999e-07, "loss": -0.032, "num_tokens": 235047374.0, "reward": 1.516121745109558, "reward_std": 0.06012687459588051, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5161217451095581, "rewards/correct_reward_func/std": 0.17097163200378418, "step": 1803 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2475.0, "completions/max_terminated_length": 2475.0, "completions/mean_length": 1418.75, "completions/mean_terminated_length": 1418.75, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 2.809968847352025, "grad_norm": 0.5923731923103333, "kl": 0.04776529222726822, "learning_rate": 8.793749999999999e-07, "loss": 0.0021, "num_tokens": 235172717.0, "reward": 1.5301859378814697, "reward_std": 0.05031603202223778, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5301858186721802, "rewards/correct_reward_func/std": 0.11344287544488907, "step": 1804 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 1476.107177734375, "completions/mean_terminated_length": 1476.107177734375, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 2.811526479750779, "grad_norm": 0.5904235243797302, "kl": 0.05001424625515938, "learning_rate": 8.7875e-07, "loss": 0.0174, "num_tokens": 235302794.0, "reward": 1.461490273475647, "reward_std": 0.050949279218912125, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46149030327796936, "rewards/correct_reward_func/std": 0.14392763376235962, "step": 1805 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1485.3929443359375, "completions/mean_terminated_length": 1485.3929443359375, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 2.8130841121495327, "grad_norm": 0.5896937847137451, "kl": 0.048910463228821754, "learning_rate": 8.78125e-07, "loss": -0.0, "num_tokens": 235433669.0, "reward": 1.4750986099243164, "reward_std": 0.06156722828745842, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4750983715057373, "rewards/correct_reward_func/std": 0.12749801576137543, "step": 1806 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1972.0, "completions/max_terminated_length": 1972.0, "completions/mean_length": 1426.9405517578125, "completions/mean_terminated_length": 1426.9405517578125, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 2.8146417445482865, "grad_norm": 0.6408569812774658, "kl": 0.04802324250340462, "learning_rate": 8.774999999999999e-07, "loss": 0.0183, "num_tokens": 235559376.0, "reward": 1.585871934890747, "reward_std": 0.0663520023226738, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5858718752861023, "rewards/correct_reward_func/std": 0.16547530889511108, "step": 1807 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1433.90478515625, "completions/mean_terminated_length": 1433.90478515625, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 2.8161993769470404, "grad_norm": 0.601051390171051, "kl": 0.048677023500204086, "learning_rate": 8.76875e-07, "loss": 0.0003, "num_tokens": 235685650.0, "reward": 1.5155295133590698, "reward_std": 0.10393714159727097, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5274341702461243, "rewards/correct_reward_func/std": 0.2089490443468094, "step": 1808 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2071.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 1460.9881591796875, "completions/mean_terminated_length": 1460.9881591796875, "completions/min_length": 909.0, "completions/min_terminated_length": 909.0, "epoch": 2.8177570093457946, "grad_norm": 0.5767249464988708, "kl": 0.04596317559480667, "learning_rate": 8.7625e-07, "loss": 0.0228, "num_tokens": 235814265.0, "reward": 1.4487906694412231, "reward_std": 0.06374548375606537, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4487904906272888, "rewards/correct_reward_func/std": 0.13334602117538452, "step": 1809 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1482.6309814453125, "completions/mean_terminated_length": 1482.6309814453125, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 2.8193146417445485, "grad_norm": 0.6088211536407471, "kl": 0.051502808928489685, "learning_rate": 8.75625e-07, "loss": 0.0039, "num_tokens": 235944794.0, "reward": 1.5229589939117432, "reward_std": 0.0628872960805893, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5229589939117432, "rewards/correct_reward_func/std": 0.1484093964099884, "step": 1810 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1424.2261962890625, "completions/mean_terminated_length": 1424.2261962890625, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 2.8208722741433023, "grad_norm": 0.6280210018157959, "kl": 0.047697125002741814, "learning_rate": 8.75e-07, "loss": -0.0107, "num_tokens": 236070435.0, "reward": 1.4353251457214355, "reward_std": 0.11402818560600281, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4591345191001892, "rewards/correct_reward_func/std": 0.134093776345253, "step": 1811 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1354.011962890625, "completions/mean_terminated_length": 1354.011962890625, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 2.822429906542056, "grad_norm": 0.5937867164611816, "kl": 0.050918109714984894, "learning_rate": 8.74375e-07, "loss": 0.0236, "num_tokens": 236189944.0, "reward": 1.4966118335723877, "reward_std": 0.07176537811756134, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49661174416542053, "rewards/correct_reward_func/std": 0.1722252368927002, "step": 1812 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 1475.0238037109375, "completions/mean_terminated_length": 1475.0238037109375, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 2.82398753894081, "grad_norm": 0.5849943161010742, "kl": 0.048153381794691086, "learning_rate": 8.7375e-07, "loss": 0.0123, "num_tokens": 236320014.0, "reward": 1.527652621269226, "reward_std": 0.054621584713459015, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5276524424552917, "rewards/correct_reward_func/std": 0.07925116270780563, "step": 1813 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 1532.8690185546875, "completions/mean_terminated_length": 1452.6385498046875, "completions/min_length": 1007.0, "completions/min_terminated_length": 1007.0, "epoch": 2.825545171339564, "grad_norm": 0.568009614944458, "kl": 0.04597240686416626, "learning_rate": 8.73125e-07, "loss": 0.0578, "num_tokens": 236454691.0, "reward": 1.4530938863754272, "reward_std": 0.09103947877883911, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4649985134601593, "rewards/correct_reward_func/std": 0.13021835684776306, "step": 1814 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1459.0238037109375, "completions/mean_terminated_length": 1459.0238037109375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.8271028037383177, "grad_norm": 0.5875236392021179, "kl": 0.05096505954861641, "learning_rate": 8.725e-07, "loss": 0.0028, "num_tokens": 236582985.0, "reward": 1.400545358657837, "reward_std": 0.0968981683254242, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.43625959753990173, "rewards/correct_reward_func/std": 0.15029320120811462, "step": 1815 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1421.2381591796875, "completions/mean_terminated_length": 1421.2381591796875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 2.8286604361370715, "grad_norm": 0.5632856488227844, "kl": 0.04839196801185608, "learning_rate": 8.718749999999999e-07, "loss": 0.0091, "num_tokens": 236708375.0, "reward": 1.4060043096542358, "reward_std": 0.06501875817775726, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.41790908575057983, "rewards/correct_reward_func/std": 0.1103295162320137, "step": 1816 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2140.0, "completions/max_terminated_length": 2140.0, "completions/mean_length": 1402.857177734375, "completions/mean_terminated_length": 1402.857177734375, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 2.8302180685358254, "grad_norm": 0.601688027381897, "kl": 0.048357656225562096, "learning_rate": 8.712499999999999e-07, "loss": 0.0233, "num_tokens": 236832065.0, "reward": 1.5042697191238403, "reward_std": 0.03907506912946701, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5042697191238403, "rewards/correct_reward_func/std": 0.211466982960701, "step": 1817 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1419.5238037109375, "completions/mean_terminated_length": 1419.5238037109375, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 2.831775700934579, "grad_norm": 0.5738736987113953, "kl": 0.04986536130309105, "learning_rate": 8.706249999999999e-07, "loss": -0.01, "num_tokens": 236957323.0, "reward": 1.451444149017334, "reward_std": 0.05272461101412773, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45144402980804443, "rewards/correct_reward_func/std": 0.1262628585100174, "step": 1818 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2311.0, "completions/max_terminated_length": 2311.0, "completions/mean_length": 1401.8214111328125, "completions/mean_terminated_length": 1401.8214111328125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 2.8333333333333335, "grad_norm": 0.6245518326759338, "kl": 0.04795471578836441, "learning_rate": 8.699999999999999e-07, "loss": 0.0146, "num_tokens": 237080872.0, "reward": 1.507387638092041, "reward_std": 0.05351592227816582, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5073875784873962, "rewards/correct_reward_func/std": 0.14517466723918915, "step": 1819 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2229.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1398.65478515625, "completions/mean_terminated_length": 1398.65478515625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 2.8348909657320873, "grad_norm": 0.606451153755188, "kl": 0.04762054979801178, "learning_rate": 8.693749999999999e-07, "loss": 0.0046, "num_tokens": 237204119.0, "reward": 1.4582210779190063, "reward_std": 0.045032598078250885, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4582210183143616, "rewards/correct_reward_func/std": 0.19477683305740356, "step": 1820 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2316.0, "completions/max_terminated_length": 2316.0, "completions/mean_length": 1438.4881591796875, "completions/mean_terminated_length": 1438.4881591796875, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 2.836448598130841, "grad_norm": 0.5924787521362305, "kl": 0.04616494104266167, "learning_rate": 8.687499999999999e-07, "loss": 0.0108, "num_tokens": 237330922.0, "reward": 1.5436882972717285, "reward_std": 0.05805344507098198, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.543688178062439, "rewards/correct_reward_func/std": 0.16784033179283142, "step": 1821 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1442.0238037109375, "completions/mean_terminated_length": 1442.0238037109375, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 2.838006230529595, "grad_norm": 0.577735960483551, "kl": 0.04895523563027382, "learning_rate": 8.681249999999999e-07, "loss": 0.0058, "num_tokens": 237457956.0, "reward": 1.4492849111557007, "reward_std": 0.07066318392753601, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4611895978450775, "rewards/correct_reward_func/std": 0.16167399287223816, "step": 1822 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1395.202392578125, "completions/mean_terminated_length": 1395.202392578125, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 2.839563862928349, "grad_norm": 0.6291164755821228, "kl": 0.050088051706552505, "learning_rate": 8.675000000000001e-07, "loss": 0.022, "num_tokens": 237581087.0, "reward": 1.46791410446167, "reward_std": 0.053007546812295914, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46791404485702515, "rewards/correct_reward_func/std": 0.11517284065485, "step": 1823 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1361.0714111328125, "completions/mean_terminated_length": 1361.0714111328125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 2.8411214953271027, "grad_norm": 0.5799793601036072, "kl": 0.050874266773462296, "learning_rate": 8.66875e-07, "loss": -0.0107, "num_tokens": 237701177.0, "reward": 1.5239530801773071, "reward_std": 0.05971672385931015, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.523952841758728, "rewards/correct_reward_func/std": 0.19533796608448029, "step": 1824 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 1354.4405517578125, "completions/mean_terminated_length": 1354.4405517578125, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 2.842679127725857, "grad_norm": 0.6134588122367859, "kl": 0.049529436975717545, "learning_rate": 8.6625e-07, "loss": 0.0091, "num_tokens": 237820692.0, "reward": 1.5182303190231323, "reward_std": 0.057895660400390625, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.518230140209198, "rewards/correct_reward_func/std": 0.14819766581058502, "step": 1825 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2284.0, "completions/max_terminated_length": 2284.0, "completions/mean_length": 1373.6905517578125, "completions/mean_terminated_length": 1373.6905517578125, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 2.844236760124611, "grad_norm": 0.6197590231895447, "kl": 0.04618115723133087, "learning_rate": 8.65625e-07, "loss": 0.0047, "num_tokens": 237941890.0, "reward": 1.5342316627502441, "reward_std": 0.0646398663520813, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5342316031455994, "rewards/correct_reward_func/std": 0.17176346480846405, "step": 1826 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2233.0, "completions/max_terminated_length": 2233.0, "completions/mean_length": 1390.9881591796875, "completions/mean_terminated_length": 1390.9881591796875, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 2.8457943925233646, "grad_norm": 0.6261128783226013, "kl": 0.04717477411031723, "learning_rate": 8.65e-07, "loss": 0.0125, "num_tokens": 238064571.0, "reward": 1.485521912574768, "reward_std": 0.0713748037815094, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48552176356315613, "rewards/correct_reward_func/std": 0.1153678148984909, "step": 1827 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3387.0, "completions/max_terminated_length": 3387.0, "completions/mean_length": 1561.3333740234375, "completions/mean_terminated_length": 1561.3333740234375, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "epoch": 2.8473520249221185, "grad_norm": 0.5480672121047974, "kl": 0.046291409060359, "learning_rate": 8.64375e-07, "loss": -0.0027, "num_tokens": 238201795.0, "reward": 1.4861243963241577, "reward_std": 0.08651053160429001, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4980289936065674, "rewards/correct_reward_func/std": 0.14908944070339203, "step": 1828 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 1468.2738037109375, "completions/mean_terminated_length": 1468.2738037109375, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 2.8489096573208723, "grad_norm": 0.544866681098938, "kl": 0.04725377634167671, "learning_rate": 8.6375e-07, "loss": -0.0208, "num_tokens": 238331334.0, "reward": 1.5554924011230469, "reward_std": 0.039573945105075836, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5554924011230469, "rewards/correct_reward_func/std": 0.1348913311958313, "step": 1829 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1446.8333740234375, "completions/mean_terminated_length": 1446.8333740234375, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 2.850467289719626, "grad_norm": 0.5850000977516174, "kl": 0.0464788768440485, "learning_rate": 8.63125e-07, "loss": 0.0426, "num_tokens": 238458910.0, "reward": 1.476670503616333, "reward_std": 0.04257005825638771, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.476670503616333, "rewards/correct_reward_func/std": 0.11883778125047684, "step": 1830 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1939.0, "completions/max_terminated_length": 1939.0, "completions/mean_length": 1421.297607421875, "completions/mean_terminated_length": 1421.297607421875, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 2.85202492211838, "grad_norm": 0.6033661365509033, "kl": 0.04749273508787155, "learning_rate": 8.625e-07, "loss": -0.0043, "num_tokens": 238584347.0, "reward": 1.518803358078003, "reward_std": 0.07845792174339294, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5307079553604126, "rewards/correct_reward_func/std": 0.16625280678272247, "step": 1831 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2279.0, "completions/max_terminated_length": 2279.0, "completions/mean_length": 1386.642822265625, "completions/mean_terminated_length": 1386.642822265625, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "epoch": 2.853582554517134, "grad_norm": 0.6366321444511414, "kl": 0.04707220196723938, "learning_rate": 8.618749999999999e-07, "loss": 0.0342, "num_tokens": 238706909.0, "reward": 1.5159767866134644, "reward_std": 0.062212228775024414, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5159767866134644, "rewards/correct_reward_func/std": 0.16013473272323608, "step": 1832 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 1458.96435546875, "completions/mean_terminated_length": 1458.96435546875, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 2.8551401869158877, "grad_norm": 0.6155539155006409, "kl": 0.047059787437319756, "learning_rate": 8.612499999999999e-07, "loss": -0.0016, "num_tokens": 238835564.0, "reward": 1.5057066679000854, "reward_std": 0.07716460525989532, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5176112651824951, "rewards/correct_reward_func/std": 0.17705877125263214, "step": 1833 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 1424.666748046875, "completions/mean_terminated_length": 1424.666748046875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 2.8566978193146415, "grad_norm": 0.5972130298614502, "kl": 0.04707394354045391, "learning_rate": 8.606249999999999e-07, "loss": 0.0117, "num_tokens": 238961392.0, "reward": 1.57952082157135, "reward_std": 0.03990659490227699, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5795207023620605, "rewards/correct_reward_func/std": 0.1201125830411911, "step": 1834 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2549.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 1425.0238037109375, "completions/mean_terminated_length": 1425.0238037109375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 2.858255451713396, "grad_norm": 0.5830649733543396, "kl": 0.04693341813981533, "learning_rate": 8.599999999999999e-07, "loss": 0.0232, "num_tokens": 239087022.0, "reward": 1.4896701574325562, "reward_std": 0.1098494753241539, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5134795308113098, "rewards/correct_reward_func/std": 0.13673284649848938, "step": 1835 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 1415.6905517578125, "completions/mean_terminated_length": 1415.6905517578125, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 2.8598130841121496, "grad_norm": 0.5900862216949463, "kl": 0.047275060787796974, "learning_rate": 8.593749999999999e-07, "loss": 0.0073, "num_tokens": 239211958.0, "reward": 1.5437850952148438, "reward_std": 0.06057523563504219, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5437849760055542, "rewards/correct_reward_func/std": 0.19467805325984955, "step": 1836 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1374.5714111328125, "completions/mean_terminated_length": 1374.5714111328125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 2.8613707165109035, "grad_norm": 0.532691240310669, "kl": 0.04752357490360737, "learning_rate": 8.587499999999999e-07, "loss": -0.0079, "num_tokens": 239333308.0, "reward": 1.5296229124069214, "reward_std": 0.06542381644248962, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5296228528022766, "rewards/correct_reward_func/std": 0.1404719054698944, "step": 1837 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2483.0, "completions/max_terminated_length": 2483.0, "completions/mean_length": 1488.202392578125, "completions/mean_terminated_length": 1488.202392578125, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 2.8629283489096573, "grad_norm": 0.5592518448829651, "kl": 0.049193499609827995, "learning_rate": 8.581249999999999e-07, "loss": -0.0065, "num_tokens": 239464335.0, "reward": 1.4309169054031372, "reward_std": 0.08092854171991348, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4428216814994812, "rewards/correct_reward_func/std": 0.10967546701431274, "step": 1838 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2470.0, "completions/max_terminated_length": 2470.0, "completions/mean_length": 1368.6190185546875, "completions/mean_terminated_length": 1368.6190185546875, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 2.864485981308411, "grad_norm": 0.5985156297683716, "kl": 0.048746079206466675, "learning_rate": 8.575e-07, "loss": 0.0214, "num_tokens": 239585053.0, "reward": 1.4367624521255493, "reward_std": 0.04547295719385147, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4367623031139374, "rewards/correct_reward_func/std": 0.16360631585121155, "step": 1839 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 1400.3809814453125, "completions/mean_terminated_length": 1400.3809814453125, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 2.866043613707165, "grad_norm": 0.5914815068244934, "kl": 0.04704183526337147, "learning_rate": 8.568750000000001e-07, "loss": 0.0144, "num_tokens": 239708685.0, "reward": 1.5398472547531128, "reward_std": 0.06250325590372086, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5398471355438232, "rewards/correct_reward_func/std": 0.14069853723049164, "step": 1840 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1455.761962890625, "completions/mean_terminated_length": 1455.761962890625, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 2.8676012461059193, "grad_norm": 0.6078109741210938, "kl": 0.04743264615535736, "learning_rate": 8.5625e-07, "loss": -0.0004, "num_tokens": 239836951.0, "reward": 1.5233697891235352, "reward_std": 0.056142520159482956, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5233696699142456, "rewards/correct_reward_func/std": 0.13059809803962708, "step": 1841 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 1506.21435546875, "completions/mean_terminated_length": 1425.66259765625, "completions/min_length": 645.0, "completions/min_terminated_length": 645.0, "epoch": 2.869158878504673, "grad_norm": 0.5849608778953552, "kl": 0.047081779688596725, "learning_rate": 8.55625e-07, "loss": 0.0131, "num_tokens": 239969431.0, "reward": 1.4787827730178833, "reward_std": 0.049712106585502625, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4787827134132385, "rewards/correct_reward_func/std": 0.10594594478607178, "step": 1842 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1423.46435546875, "completions/mean_terminated_length": 1423.46435546875, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 2.870716510903427, "grad_norm": 0.6069309711456299, "kl": 0.0469362698495388, "learning_rate": 8.55e-07, "loss": 0.0051, "num_tokens": 240094906.0, "reward": 1.488030195236206, "reward_std": 0.08242636919021606, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4999348819255829, "rewards/correct_reward_func/std": 0.1323889046907425, "step": 1843 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2116.0, "completions/max_terminated_length": 2116.0, "completions/mean_length": 1449.25, "completions/mean_terminated_length": 1449.25, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 2.872274143302181, "grad_norm": 0.5904936790466309, "kl": 0.04987756535410881, "learning_rate": 8.54375e-07, "loss": 0.0063, "num_tokens": 240222931.0, "reward": 1.5386792421340942, "reward_std": 0.05157402530312538, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5386791229248047, "rewards/correct_reward_func/std": 0.1543138027191162, "step": 1844 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1478.9761962890625, "completions/mean_terminated_length": 1478.9761962890625, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 2.8738317757009346, "grad_norm": 0.594489574432373, "kl": 0.04852667637169361, "learning_rate": 8.5375e-07, "loss": 0.0215, "num_tokens": 240353423.0, "reward": 1.564149022102356, "reward_std": 0.05337667465209961, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5641489028930664, "rewards/correct_reward_func/std": 0.16538885235786438, "step": 1845 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1408.107177734375, "completions/mean_terminated_length": 1408.107177734375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 2.8753894080996885, "grad_norm": 0.5920664072036743, "kl": 0.04956894926726818, "learning_rate": 8.53125e-07, "loss": 0.0045, "num_tokens": 240477650.0, "reward": 1.5530227422714233, "reward_std": 0.05786222591996193, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.553022563457489, "rewards/correct_reward_func/std": 0.10814294964075089, "step": 1846 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 1453.9881591796875, "completions/mean_terminated_length": 1453.9881591796875, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 2.8769470404984423, "grad_norm": 0.5977966785430908, "kl": 0.0495617501437664, "learning_rate": 8.525e-07, "loss": 0.0167, "num_tokens": 240605659.0, "reward": 1.4735960960388184, "reward_std": 0.06539558619260788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4735960364341736, "rewards/correct_reward_func/std": 0.16664306819438934, "step": 1847 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1445.6785888671875, "completions/mean_terminated_length": 1445.6785888671875, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 2.878504672897196, "grad_norm": 0.5976069569587708, "kl": 0.046656979247927666, "learning_rate": 8.51875e-07, "loss": 0.0096, "num_tokens": 240733288.0, "reward": 1.4859282970428467, "reward_std": 0.07384907454252243, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4978329837322235, "rewards/correct_reward_func/std": 0.13206151127815247, "step": 1848 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3078.0, "completions/max_terminated_length": 3078.0, "completions/mean_length": 1397.011962890625, "completions/mean_terminated_length": 1397.011962890625, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 2.88006230529595, "grad_norm": 0.6243910193443298, "kl": 0.04769846796989441, "learning_rate": 8.512499999999999e-07, "loss": -0.0113, "num_tokens": 240856535.0, "reward": 1.5080736875534058, "reward_std": 0.07147131115198135, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5199784636497498, "rewards/correct_reward_func/std": 0.1378011256456375, "step": 1849 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 1492.8333740234375, "completions/mean_terminated_length": 1412.1204833984375, "completions/min_length": 885.0, "completions/min_terminated_length": 885.0, "epoch": 2.881619937694704, "grad_norm": 0.5973017811775208, "kl": 0.046514419838786125, "learning_rate": 8.506249999999999e-07, "loss": 0.0986, "num_tokens": 240987831.0, "reward": 1.5620269775390625, "reward_std": 0.06724628061056137, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.562026858329773, "rewards/correct_reward_func/std": 0.18270976841449738, "step": 1850 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2670.0, "completions/max_terminated_length": 2670.0, "completions/mean_length": 1383.357177734375, "completions/mean_terminated_length": 1383.357177734375, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 2.883177570093458, "grad_norm": 0.5846635103225708, "kl": 0.04700392670929432, "learning_rate": 8.499999999999999e-07, "loss": -0.0034, "num_tokens": 241110027.0, "reward": 1.5491745471954346, "reward_std": 0.08331379294395447, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5610793232917786, "rewards/correct_reward_func/std": 0.18074071407318115, "step": 1851 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1358.857177734375, "completions/mean_terminated_length": 1358.857177734375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 2.884735202492212, "grad_norm": 0.5922337174415588, "kl": 0.049716997891664505, "learning_rate": 8.493749999999999e-07, "loss": 0.0073, "num_tokens": 241230165.0, "reward": 1.5529330968856812, "reward_std": 0.08456330001354218, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5648377537727356, "rewards/correct_reward_func/std": 0.14533573389053345, "step": 1852 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1409.8809814453125, "completions/mean_terminated_length": 1409.8809814453125, "completions/min_length": 771.0, "completions/min_terminated_length": 771.0, "epoch": 2.8862928348909658, "grad_norm": 0.6203679442405701, "kl": 0.050557443872094154, "learning_rate": 8.487499999999999e-07, "loss": -0.011, "num_tokens": 241354475.0, "reward": 1.513112187385559, "reward_std": 0.06728774309158325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5131121277809143, "rewards/correct_reward_func/std": 0.159018412232399, "step": 1853 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1446.59521484375, "completions/mean_terminated_length": 1446.59521484375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 2.8878504672897196, "grad_norm": 0.5875358581542969, "kl": 0.049160800874233246, "learning_rate": 8.481249999999999e-07, "loss": -0.0112, "num_tokens": 241482271.0, "reward": 1.514503836631775, "reward_std": 0.05262278392910957, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5145036578178406, "rewards/correct_reward_func/std": 0.12533026933670044, "step": 1854 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2139.0, "completions/max_terminated_length": 2139.0, "completions/mean_length": 1396.857177734375, "completions/mean_terminated_length": 1396.857177734375, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 2.8894080996884735, "grad_norm": 0.6100218892097473, "kl": 0.04976900853216648, "learning_rate": 8.475e-07, "loss": -0.0161, "num_tokens": 241605571.0, "reward": 1.5430610179901123, "reward_std": 0.08355709165334702, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5549657940864563, "rewards/correct_reward_func/std": 0.15889394283294678, "step": 1855 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 1558.5714111328125, "completions/mean_terminated_length": 1478.6505126953125, "completions/min_length": 1022.0, "completions/min_terminated_length": 1022.0, "epoch": 2.8909657320872273, "grad_norm": 0.5747582316398621, "kl": 0.04804874211549759, "learning_rate": 8.46875e-07, "loss": 0.0629, "num_tokens": 241742575.0, "reward": 1.4551275968551636, "reward_std": 0.07719681411981583, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45512744784355164, "rewards/correct_reward_func/std": 0.14709919691085815, "step": 1856 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4266.0, "completions/max_terminated_length": 4266.0, "completions/mean_length": 1449.40478515625, "completions/mean_terminated_length": 1449.40478515625, "completions/min_length": 697.0, "completions/min_terminated_length": 697.0, "epoch": 2.8925233644859816, "grad_norm": 0.5797988176345825, "kl": 0.047118011862039566, "learning_rate": 8.462499999999999e-07, "loss": -0.0082, "num_tokens": 241870319.0, "reward": 1.445839285850525, "reward_std": 0.06089409440755844, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45774388313293457, "rewards/correct_reward_func/std": 0.1765856146812439, "step": 1857 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1403.297607421875, "completions/mean_terminated_length": 1403.297607421875, "completions/min_length": 509.0, "completions/min_terminated_length": 509.0, "epoch": 2.8940809968847354, "grad_norm": 0.5940139293670654, "kl": 0.05095171742141247, "learning_rate": 8.45625e-07, "loss": 0.0528, "num_tokens": 241994136.0, "reward": 1.4712756872177124, "reward_std": 0.0887523666024208, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48318037390708923, "rewards/correct_reward_func/std": 0.17613986134529114, "step": 1858 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2095.0, "completions/max_terminated_length": 2095.0, "completions/mean_length": 1447.8214111328125, "completions/mean_terminated_length": 1447.8214111328125, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 2.8956386292834893, "grad_norm": 0.6246974468231201, "kl": 0.04900188185274601, "learning_rate": 8.45e-07, "loss": -0.0112, "num_tokens": 242121669.0, "reward": 1.5394078493118286, "reward_std": 0.04750818759202957, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5394078493118286, "rewards/correct_reward_func/std": 0.14948725700378418, "step": 1859 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 1447.84521484375, "completions/mean_terminated_length": 1447.84521484375, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 2.897196261682243, "grad_norm": 0.6124988794326782, "kl": 0.05242463760077953, "learning_rate": 8.44375e-07, "loss": -0.0019, "num_tokens": 242249186.0, "reward": 1.4994066953659058, "reward_std": 0.05752633884549141, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4994066655635834, "rewards/correct_reward_func/std": 0.16376963257789612, "step": 1860 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2370.0, "completions/max_terminated_length": 2370.0, "completions/mean_length": 1471.2261962890625, "completions/mean_terminated_length": 1471.2261962890625, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 2.898753894080997, "grad_norm": 0.572144091129303, "kl": 0.0477215014398098, "learning_rate": 8.4375e-07, "loss": 0.0153, "num_tokens": 242378853.0, "reward": 1.5255759954452515, "reward_std": 0.14635653793811798, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.5612902641296387, "rewards/correct_reward_func/std": 0.14039376378059387, "step": 1861 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 1520.3809814453125, "completions/mean_terminated_length": 1520.3809814453125, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 2.9003115264797508, "grad_norm": 0.5680839419364929, "kl": 0.0483478382229805, "learning_rate": 8.43125e-07, "loss": -0.0366, "num_tokens": 242512595.0, "reward": 1.4942126274108887, "reward_std": 0.07211074233055115, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5061174035072327, "rewards/correct_reward_func/std": 0.15040703117847443, "step": 1862 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1421.8095703125, "completions/mean_terminated_length": 1340.240966796875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 2.9018691588785046, "grad_norm": 0.5870024561882019, "kl": 0.046427130699157715, "learning_rate": 8.425e-07, "loss": 0.0787, "num_tokens": 242637907.0, "reward": 1.4904773235321045, "reward_std": 0.119143545627594, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5142868757247925, "rewards/correct_reward_func/std": 0.14789333939552307, "step": 1863 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1423.8690185546875, "completions/mean_terminated_length": 1423.8690185546875, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 2.9034267912772584, "grad_norm": 0.5575277805328369, "kl": 0.050678523257374763, "learning_rate": 8.41875e-07, "loss": 0.0063, "num_tokens": 242763422.0, "reward": 1.5458518266677856, "reward_std": 0.045534782111644745, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5458517670631409, "rewards/correct_reward_func/std": 0.157304584980011, "step": 1864 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1933.0, "completions/max_terminated_length": 1933.0, "completions/mean_length": 1396.416748046875, "completions/mean_terminated_length": 1396.416748046875, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 2.9049844236760123, "grad_norm": 0.5825334191322327, "kl": 0.047192465513944626, "learning_rate": 8.4125e-07, "loss": -0.0043, "num_tokens": 242886451.0, "reward": 1.5070422887802124, "reward_std": 0.06566516309976578, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5070421695709229, "rewards/correct_reward_func/std": 0.11868878453969955, "step": 1865 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2531.0, "completions/max_terminated_length": 2531.0, "completions/mean_length": 1428.8333740234375, "completions/mean_terminated_length": 1428.8333740234375, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 2.906542056074766, "grad_norm": 0.6089467406272888, "kl": 0.05040416121482849, "learning_rate": 8.406249999999999e-07, "loss": 0.0131, "num_tokens": 243012479.0, "reward": 1.5890716314315796, "reward_std": 0.07957607507705688, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.6009762287139893, "rewards/correct_reward_func/std": 0.17113424837589264, "step": 1866 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2313.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 1474.71435546875, "completions/mean_terminated_length": 1474.71435546875, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 2.9080996884735204, "grad_norm": 0.623979926109314, "kl": 0.04805207625031471, "learning_rate": 8.399999999999999e-07, "loss": 0.0171, "num_tokens": 243142355.0, "reward": 1.4651129245758057, "reward_std": 0.09641896188259125, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4889222979545593, "rewards/correct_reward_func/std": 0.13223254680633545, "step": 1867 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2704.0, "completions/max_terminated_length": 2704.0, "completions/mean_length": 1468.40478515625, "completions/mean_terminated_length": 1468.40478515625, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 2.9096573208722742, "grad_norm": 0.5672589540481567, "kl": 0.04999713972210884, "learning_rate": 8.393749999999999e-07, "loss": -0.0138, "num_tokens": 243271779.0, "reward": 1.5421980619430542, "reward_std": 0.05601627007126808, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5421980023384094, "rewards/correct_reward_func/std": 0.15507298707962036, "step": 1868 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2690.0, "completions/max_terminated_length": 2690.0, "completions/mean_length": 1440.6785888671875, "completions/mean_terminated_length": 1440.6785888671875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 2.911214953271028, "grad_norm": 0.6079015731811523, "kl": 0.04916231147944927, "learning_rate": 8.387499999999999e-07, "loss": -0.0097, "num_tokens": 243398616.0, "reward": 1.5027259588241577, "reward_std": 0.07206296920776367, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5146305561065674, "rewards/correct_reward_func/std": 0.11002402752637863, "step": 1869 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 1508.166748046875, "completions/mean_terminated_length": 1508.166748046875, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 2.912772585669782, "grad_norm": 0.5906843543052673, "kl": 0.05019880086183548, "learning_rate": 8.38125e-07, "loss": 0.0042, "num_tokens": 243531158.0, "reward": 1.5072680711746216, "reward_std": 0.07608100026845932, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5191727876663208, "rewards/correct_reward_func/std": 0.11280336230993271, "step": 1870 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2134.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1439.761962890625, "completions/mean_terminated_length": 1439.761962890625, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 2.9143302180685358, "grad_norm": 0.620227038860321, "kl": 0.04991860315203667, "learning_rate": 8.375e-07, "loss": 0.0195, "num_tokens": 243657858.0, "reward": 1.5205199718475342, "reward_std": 0.0658356100320816, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5205199718475342, "rewards/correct_reward_func/std": 0.15568125247955322, "step": 1871 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2400.0, "completions/max_terminated_length": 2400.0, "completions/mean_length": 1503.0595703125, "completions/mean_terminated_length": 1503.0595703125, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 2.9158878504672896, "grad_norm": 0.6337782740592957, "kl": 0.04720822721719742, "learning_rate": 8.36875e-07, "loss": 0.012, "num_tokens": 243790013.0, "reward": 1.5150138139724731, "reward_std": 0.1062217578291893, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5269186496734619, "rewards/correct_reward_func/std": 0.15477226674556732, "step": 1872 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1486.3095703125, "completions/mean_terminated_length": 1486.3095703125, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 2.917445482866044, "grad_norm": 0.6096763014793396, "kl": 0.0476483479142189, "learning_rate": 8.3625e-07, "loss": 0.0168, "num_tokens": 243920857.0, "reward": 1.5651789903640747, "reward_std": 0.0608663409948349, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5651789903640747, "rewards/correct_reward_func/std": 0.17377328872680664, "step": 1873 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1438.1429443359375, "completions/mean_terminated_length": 1438.1429443359375, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 2.9190031152647977, "grad_norm": 0.6152294874191284, "kl": 0.04864080063998699, "learning_rate": 8.356249999999999e-07, "loss": -0.0216, "num_tokens": 244047553.0, "reward": 1.5261359214782715, "reward_std": 0.07345680147409439, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5261358618736267, "rewards/correct_reward_func/std": 0.15716883540153503, "step": 1874 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2780.0, "completions/max_terminated_length": 2780.0, "completions/mean_length": 1576.3690185546875, "completions/mean_terminated_length": 1576.3690185546875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 2.9205607476635516, "grad_norm": 0.5771212577819824, "kl": 0.04716803692281246, "learning_rate": 8.349999999999999e-07, "loss": 0.0252, "num_tokens": 244185998.0, "reward": 1.5080478191375732, "reward_std": 0.0463634692132473, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5080477595329285, "rewards/correct_reward_func/std": 0.17040875554084778, "step": 1875 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2096.0, "completions/max_terminated_length": 2096.0, "completions/mean_length": 1472.452392578125, "completions/mean_terminated_length": 1472.452392578125, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 2.9221183800623054, "grad_norm": 0.5846139192581177, "kl": 0.050352565944194794, "learning_rate": 8.34375e-07, "loss": 0.0207, "num_tokens": 244315744.0, "reward": 1.491349697113037, "reward_std": 0.049779076129198074, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49134963750839233, "rewards/correct_reward_func/std": 0.184879869222641, "step": 1876 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2241.0, "completions/max_terminated_length": 2241.0, "completions/mean_length": 1485.416748046875, "completions/mean_terminated_length": 1485.416748046875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 2.9236760124610592, "grad_norm": 0.5563156008720398, "kl": 0.04933321475982666, "learning_rate": 8.3375e-07, "loss": -0.0137, "num_tokens": 244446543.0, "reward": 1.5054928064346313, "reward_std": 0.08062034100294113, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5173975825309753, "rewards/correct_reward_func/std": 0.16196118295192719, "step": 1877 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1989.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1454.4285888671875, "completions/mean_terminated_length": 1454.4285888671875, "completions/min_length": 773.0, "completions/min_terminated_length": 773.0, "epoch": 2.925233644859813, "grad_norm": 0.5981301069259644, "kl": 0.050176067277789116, "learning_rate": 8.33125e-07, "loss": 0.0164, "num_tokens": 244574601.0, "reward": 1.542063593864441, "reward_std": 0.05000157281756401, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5420635938644409, "rewards/correct_reward_func/std": 0.2308300882577896, "step": 1878 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 1479.797607421875, "completions/mean_terminated_length": 1479.797607421875, "completions/min_length": 588.0, "completions/min_terminated_length": 588.0, "epoch": 2.926791277258567, "grad_norm": 0.5469790101051331, "kl": 0.04692430421710014, "learning_rate": 8.325e-07, "loss": -0.004, "num_tokens": 244704856.0, "reward": 1.543453574180603, "reward_std": 0.07569872587919235, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5553582310676575, "rewards/correct_reward_func/std": 0.14638683199882507, "step": 1879 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 1563.3333740234375, "completions/mean_terminated_length": 1563.3333740234375, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 2.9283489096573208, "grad_norm": 0.5495012998580933, "kl": 0.048657236620783806, "learning_rate": 8.31875e-07, "loss": -0.0018, "num_tokens": 244842374.0, "reward": 1.4648199081420898, "reward_std": 0.07421709597110748, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4767245352268219, "rewards/correct_reward_func/std": 0.14579324424266815, "step": 1880 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2014.0, "completions/max_terminated_length": 2014.0, "completions/mean_length": 1466.21435546875, "completions/mean_terminated_length": 1466.21435546875, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 2.9299065420560746, "grad_norm": 0.6079525947570801, "kl": 0.047798359766602516, "learning_rate": 8.3125e-07, "loss": 0.0236, "num_tokens": 244971596.0, "reward": 1.4787312746047974, "reward_std": 0.06719550490379333, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.478731244802475, "rewards/correct_reward_func/std": 0.16213369369506836, "step": 1881 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 1401.047607421875, "completions/mean_terminated_length": 1401.047607421875, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 2.9314641744548284, "grad_norm": 0.598484992980957, "kl": 0.04947370104491711, "learning_rate": 8.306249999999999e-07, "loss": 0.0189, "num_tokens": 245095128.0, "reward": 1.457588791847229, "reward_std": 0.04164276272058487, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45758873224258423, "rewards/correct_reward_func/std": 0.1568526178598404, "step": 1882 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2071.0, "completions/max_terminated_length": 2071.0, "completions/mean_length": 1456.90478515625, "completions/mean_terminated_length": 1456.90478515625, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 2.9330218068535827, "grad_norm": 0.6279171705245972, "kl": 0.04823843948543072, "learning_rate": 8.299999999999999e-07, "loss": -0.0038, "num_tokens": 245223532.0, "reward": 1.5481948852539062, "reward_std": 0.06244068220257759, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5481947660446167, "rewards/correct_reward_func/std": 0.15312427282333374, "step": 1883 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1373.547607421875, "completions/mean_terminated_length": 1373.547607421875, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 2.9345794392523366, "grad_norm": 0.5992941856384277, "kl": 0.04740362986922264, "learning_rate": 8.293749999999999e-07, "loss": 0.0134, "num_tokens": 245344658.0, "reward": 1.558994174003601, "reward_std": 0.04547742009162903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5589941143989563, "rewards/correct_reward_func/std": 0.18795959651470184, "step": 1884 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2415.0, "completions/max_terminated_length": 2415.0, "completions/mean_length": 1459.6905517578125, "completions/mean_terminated_length": 1459.6905517578125, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "epoch": 2.9361370716510904, "grad_norm": 0.6359865665435791, "kl": 0.04659297317266464, "learning_rate": 8.287499999999999e-07, "loss": 0.015, "num_tokens": 245473242.0, "reward": 1.4992262125015259, "reward_std": 0.0570068359375, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49922606348991394, "rewards/correct_reward_func/std": 0.12234224379062653, "step": 1885 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1411.702392578125, "completions/mean_terminated_length": 1411.702392578125, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 2.9376947040498442, "grad_norm": 0.583466649055481, "kl": 0.046786656603217125, "learning_rate": 8.28125e-07, "loss": 0.0039, "num_tokens": 245597837.0, "reward": 1.566314697265625, "reward_std": 0.05086834356188774, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5663148164749146, "rewards/correct_reward_func/std": 0.12961094081401825, "step": 1886 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2516.0, "completions/max_terminated_length": 2516.0, "completions/mean_length": 1469.4405517578125, "completions/mean_terminated_length": 1469.4405517578125, "completions/min_length": 945.0, "completions/min_terminated_length": 945.0, "epoch": 2.939252336448598, "grad_norm": 0.5761610269546509, "kl": 0.04823177307844162, "learning_rate": 8.275e-07, "loss": -0.0292, "num_tokens": 245727318.0, "reward": 1.475521445274353, "reward_std": 0.06537957489490509, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47552135586738586, "rewards/correct_reward_func/std": 0.13384057581424713, "step": 1887 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2355.0, "completions/mean_length": 1604.7261962890625, "completions/mean_terminated_length": 1525.361328125, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 2.940809968847352, "grad_norm": 0.5015782117843628, "kl": 0.046176595613360405, "learning_rate": 8.26875e-07, "loss": 0.0504, "num_tokens": 245868223.0, "reward": 1.4730963706970215, "reward_std": 0.11941562592983246, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.49690574407577515, "rewards/correct_reward_func/std": 0.15850283205509186, "step": 1888 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3450.0, "completions/max_terminated_length": 3450.0, "completions/mean_length": 1443.297607421875, "completions/mean_terminated_length": 1443.297607421875, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 2.942367601246106, "grad_norm": 0.5957579016685486, "kl": 0.04620378278195858, "learning_rate": 8.2625e-07, "loss": 0.0003, "num_tokens": 245995412.0, "reward": 1.5307624340057373, "reward_std": 0.09587346762418747, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5545720458030701, "rewards/correct_reward_func/std": 0.14475734531879425, "step": 1889 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 1430.916748046875, "completions/mean_terminated_length": 1430.916748046875, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "epoch": 2.94392523364486, "grad_norm": 0.5724050998687744, "kl": 0.0482216402888298, "learning_rate": 8.25625e-07, "loss": -0.0015, "num_tokens": 246121555.0, "reward": 1.5074776411056519, "reward_std": 0.042747583240270615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5074775218963623, "rewards/correct_reward_func/std": 0.11497288942337036, "step": 1890 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1402.6309814453125, "completions/mean_terminated_length": 1402.6309814453125, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 2.945482866043614, "grad_norm": 0.65523362159729, "kl": 0.04883107356727123, "learning_rate": 8.249999999999999e-07, "loss": -0.0104, "num_tokens": 246245334.0, "reward": 1.5146434307098389, "reward_std": 0.04966704174876213, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5146431922912598, "rewards/correct_reward_func/std": 0.1939820498228073, "step": 1891 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1464.261962890625, "completions/mean_terminated_length": 1464.261962890625, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 2.9470404984423677, "grad_norm": 0.5518568754196167, "kl": 0.05018050596117973, "learning_rate": 8.243749999999999e-07, "loss": -0.0126, "num_tokens": 246374506.0, "reward": 1.5818240642547607, "reward_std": 0.04545382410287857, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5818238854408264, "rewards/correct_reward_func/std": 0.18372027575969696, "step": 1892 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 1463.96435546875, "completions/mean_terminated_length": 1463.96435546875, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 2.9485981308411215, "grad_norm": 0.6223868727684021, "kl": 0.0469142347574234, "learning_rate": 8.2375e-07, "loss": 0.0554, "num_tokens": 246503401.0, "reward": 1.5689200162887573, "reward_std": 0.08382821828126907, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5808245539665222, "rewards/correct_reward_func/std": 0.14941781759262085, "step": 1893 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1441.6429443359375, "completions/mean_terminated_length": 1441.6429443359375, "completions/min_length": 1047.0, "completions/min_terminated_length": 1047.0, "epoch": 2.9501557632398754, "grad_norm": 0.6775906085968018, "kl": 0.04657982662320137, "learning_rate": 8.23125e-07, "loss": 0.021, "num_tokens": 246630415.0, "reward": 1.5754626989364624, "reward_std": 0.06480978429317474, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5754626989364624, "rewards/correct_reward_func/std": 0.1700669229030609, "step": 1894 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2609.0, "completions/max_terminated_length": 2609.0, "completions/mean_length": 1477.8095703125, "completions/mean_terminated_length": 1477.8095703125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 2.9517133956386292, "grad_norm": 0.6518641710281372, "kl": 0.048412496224045753, "learning_rate": 8.225e-07, "loss": 0.0029, "num_tokens": 246760617.0, "reward": 1.5503627061843872, "reward_std": 0.06271228194236755, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5503626465797424, "rewards/correct_reward_func/std": 0.15744276344776154, "step": 1895 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2266.0, "completions/max_terminated_length": 2266.0, "completions/mean_length": 1379.8809814453125, "completions/mean_terminated_length": 1379.8809814453125, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 2.953271028037383, "grad_norm": 0.6140989065170288, "kl": 0.04764441028237343, "learning_rate": 8.21875e-07, "loss": 0.0202, "num_tokens": 246882317.0, "reward": 1.5056192874908447, "reward_std": 0.056696370244026184, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5056191682815552, "rewards/correct_reward_func/std": 0.15027913451194763, "step": 1896 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1429.7261962890625, "completions/mean_terminated_length": 1429.7261962890625, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "epoch": 2.954828660436137, "grad_norm": 0.6034258008003235, "kl": 0.0477066021412611, "learning_rate": 8.2125e-07, "loss": 0.0141, "num_tokens": 247008234.0, "reward": 1.5244462490081787, "reward_std": 0.08347533643245697, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5363509058952332, "rewards/correct_reward_func/std": 0.1700936257839203, "step": 1897 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1397.8214111328125, "completions/mean_terminated_length": 1397.8214111328125, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 2.9563862928348907, "grad_norm": 0.5953632593154907, "kl": 0.048819998279213905, "learning_rate": 8.20625e-07, "loss": 0.0321, "num_tokens": 247131591.0, "reward": 1.5548338890075684, "reward_std": 0.06181872636079788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5548337697982788, "rewards/correct_reward_func/std": 0.1939641386270523, "step": 1898 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1444.7261962890625, "completions/mean_terminated_length": 1444.7261962890625, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 2.957943925233645, "grad_norm": 0.5956392884254456, "kl": 0.047058865427970886, "learning_rate": 8.199999999999999e-07, "loss": 0.0048, "num_tokens": 247258834.0, "reward": 1.5142505168914795, "reward_std": 0.08862974494695663, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5261551737785339, "rewards/correct_reward_func/std": 0.12416929751634598, "step": 1899 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1932.0, "completions/max_terminated_length": 1932.0, "completions/mean_length": 1401.547607421875, "completions/mean_terminated_length": 1401.547607421875, "completions/min_length": 783.0, "completions/min_terminated_length": 783.0, "epoch": 2.959501557632399, "grad_norm": 0.6269001960754395, "kl": 0.051041221246123314, "learning_rate": 8.193749999999999e-07, "loss": -0.0006, "num_tokens": 247382840.0, "reward": 1.5316612720489502, "reward_std": 0.06918644160032272, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5316612720489502, "rewards/correct_reward_func/std": 0.09987199306488037, "step": 1900 } ], "logging_steps": 1.0, "max_steps": 3210, "num_input_tokens_seen": 247382840, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }