diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16134 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0903426791277258, + "eval_steps": 500, + "global_step": 700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2517.0, + "completions/max_terminated_length": 2517.0, + "completions/mean_length": 1512.65478515625, + "completions/mean_terminated_length": 1512.65478515625, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "epoch": 0.001557632398753894, + "grad_norm": 0.602738082408905, + "kl": -8.884206703640984e-10, + "learning_rate": 0.0, + "loss": 0.02, + "num_tokens": 133045.0, + "reward": 1.3617119789123535, + "reward_std": 0.09446237236261368, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.36171185970306396, + "rewards/correct_reward_func/std": 0.15946270525455475, + "step": 1 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2520.0, + "completions/mean_length": 1677.65478515625, + "completions/mean_terminated_length": 1518.7681884765625, + "completions/min_length": 690.0, + "completions/min_terminated_length": 690.0, + "epoch": 0.003115264797507788, + "grad_norm": 0.5372695922851562, + "kl": -8.036803722522023e-10, + "learning_rate": 2e-07, + "loss": 0.0986, + "num_tokens": 279938.0, + "reward": 1.3327711820602417, + "reward_std": 0.11337035149335861, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3327711820602417, + "rewards/correct_reward_func/std": 0.14508673548698425, + "step": 2 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2044.0, + "completions/mean_length": 1559.9881591796875, + "completions/mean_terminated_length": 1480.084228515625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.004672897196261682, + "grad_norm": 0.5770987868309021, + "kl": 0.0008140590216498822, + "learning_rate": 4e-07, + "loss": 0.0348, + "num_tokens": 417181.0, + "reward": 1.3511351346969604, + "reward_std": 0.12009123712778091, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.35113492608070374, + "rewards/correct_reward_func/std": 0.16792196035385132, + "step": 3 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2638.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 1605.8929443359375, + "completions/mean_terminated_length": 1605.8929443359375, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.006230529595015576, + "grad_norm": 0.5473058223724365, + "kl": 0.0007717256958130747, + "learning_rate": 6e-07, + "loss": 0.0022, + "num_tokens": 557962.0, + "reward": 1.3706098794937134, + "reward_std": 0.13414135575294495, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.37060973048210144, + "rewards/correct_reward_func/std": 0.1871974617242813, + "step": 4 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2486.0, + "completions/mean_length": 1513.59521484375, + "completions/mean_terminated_length": 1433.1324462890625, + "completions/min_length": 314.0, + "completions/min_terminated_length": 314.0, + "epoch": 0.00778816199376947, + "grad_norm": 0.5818154811859131, + "kl": 0.0007767349597997963, + "learning_rate": 8e-07, + "loss": 0.0841, + "num_tokens": 690954.0, + "reward": 1.2946425676345825, + "reward_std": 0.20980872213840485, + "rewards/contains_chinese/mean": 0.9523809552192688, + "rewards/contains_chinese/std": 0.21423791348934174, + "rewards/correct_reward_func/mean": 0.342261403799057, + "rewards/correct_reward_func/std": 0.15122981369495392, + "step": 5 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2486.0, + "completions/max_terminated_length": 2486.0, + "completions/mean_length": 1510.3214111328125, + "completions/mean_terminated_length": 1510.3214111328125, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "epoch": 0.009345794392523364, + "grad_norm": 0.5755687355995178, + "kl": 0.0008145314350258559, + "learning_rate": 1e-06, + "loss": 0.0469, + "num_tokens": 823905.0, + "reward": 1.4106093645095825, + "reward_std": 0.12289178371429443, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.42251405119895935, + "rewards/correct_reward_func/std": 0.1738594025373459, + "step": 6 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2110.0, + "completions/max_terminated_length": 2110.0, + "completions/mean_length": 1465.047607421875, + "completions/mean_terminated_length": 1465.047607421875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.010903426791277258, + "grad_norm": 0.6340458393096924, + "kl": 0.0008423295512329787, + "learning_rate": 1.2e-06, + "loss": -0.0223, + "num_tokens": 952801.0, + "reward": 1.3219488859176636, + "reward_std": 0.14029237627983093, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.3338535726070404, + "rewards/correct_reward_func/std": 0.13986904919147491, + "step": 7 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2268.0, + "completions/max_terminated_length": 2268.0, + "completions/mean_length": 1477.011962890625, + "completions/mean_terminated_length": 1477.011962890625, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "epoch": 0.012461059190031152, + "grad_norm": 0.5733258128166199, + "kl": 0.0008218956645578146, + "learning_rate": 1.4e-06, + "loss": -0.0078, + "num_tokens": 1082942.0, + "reward": 1.3452098369598389, + "reward_std": 0.09600555151700974, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3452097773551941, + "rewards/correct_reward_func/std": 0.13662667572498322, + "step": 8 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2478.0, + "completions/max_terminated_length": 2478.0, + "completions/mean_length": 1486.7261962890625, + "completions/mean_terminated_length": 1486.7261962890625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.014018691588785047, + "grad_norm": 0.5680725574493408, + "kl": 0.0008863781113177538, + "learning_rate": 1.6e-06, + "loss": -0.0153, + "num_tokens": 1213953.0, + "reward": 1.3956345319747925, + "reward_std": 0.13600240647792816, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.39563441276550293, + "rewards/correct_reward_func/std": 0.183233380317688, + "step": 9 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2198.0, + "completions/max_terminated_length": 2198.0, + "completions/mean_length": 1473.6429443359375, + "completions/mean_terminated_length": 1473.6429443359375, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 0.01557632398753894, + "grad_norm": 0.5650473833084106, + "kl": 0.0009199154155794531, + "learning_rate": 1.8e-06, + "loss": 0.0136, + "num_tokens": 1343673.0, + "reward": 1.3652774095535278, + "reward_std": 0.0817384421825409, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.36527732014656067, + "rewards/correct_reward_func/std": 0.14138561487197876, + "step": 10 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2230.0, + "completions/mean_length": 1687.2857666015625, + "completions/mean_terminated_length": 1528.634033203125, + "completions/min_length": 917.0, + "completions/min_terminated_length": 917.0, + "epoch": 0.017133956386292833, + "grad_norm": 0.5159066915512085, + "kl": 0.0009192087745759636, + "learning_rate": 2e-06, + "loss": 0.0883, + "num_tokens": 1491351.0, + "reward": 1.4014300107955933, + "reward_std": 0.18983161449432373, + "rewards/contains_chinese/mean": 0.9523809552192688, + "rewards/contains_chinese/std": 0.21423791348934174, + "rewards/correct_reward_func/mean": 0.44904908537864685, + "rewards/correct_reward_func/std": 0.16164183616638184, + "step": 11 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2740.0, + "completions/max_terminated_length": 2740.0, + "completions/mean_length": 1522.107177734375, + "completions/mean_terminated_length": 1522.107177734375, + "completions/min_length": 274.0, + "completions/min_terminated_length": 274.0, + "epoch": 0.018691588785046728, + "grad_norm": 0.5924640893936157, + "kl": 0.0011004244443029165, + "learning_rate": 1.999375e-06, + "loss": -0.0201, + "num_tokens": 1625304.0, + "reward": 1.3764175176620483, + "reward_std": 0.11236605048179626, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.388322114944458, + "rewards/correct_reward_func/std": 0.14457714557647705, + "step": 12 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2378.0, + "completions/mean_length": 1624.9285888671875, + "completions/mean_terminated_length": 1464.756103515625, + "completions/min_length": 693.0, + "completions/min_terminated_length": 693.0, + "epoch": 0.020249221183800622, + "grad_norm": 0.5270693302154541, + "kl": 0.0012032188242301345, + "learning_rate": 1.99875e-06, + "loss": 0.0802, + "num_tokens": 1767936.0, + "reward": 1.3871831893920898, + "reward_std": 0.13983462750911713, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.3990878164768219, + "rewards/correct_reward_func/std": 0.1606336236000061, + "step": 13 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2426.0, + "completions/max_terminated_length": 2426.0, + "completions/mean_length": 1479.9761962890625, + "completions/mean_terminated_length": 1479.9761962890625, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.021806853582554516, + "grad_norm": 0.6190818548202515, + "kl": 0.0014357012696564198, + "learning_rate": 1.998125e-06, + "loss": -0.0216, + "num_tokens": 1898164.0, + "reward": 1.3895180225372314, + "reward_std": 0.08238209784030914, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.38951802253723145, + "rewards/correct_reward_func/std": 0.11832733452320099, + "step": 14 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2150.0, + "completions/mean_length": 1475.0, + "completions/mean_terminated_length": 1394.072265625, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "epoch": 0.02336448598130841, + "grad_norm": 0.6059337258338928, + "kl": 0.0016431952244602144, + "learning_rate": 1.9975e-06, + "loss": 0.0526, + "num_tokens": 2027914.0, + "reward": 1.3913025856018066, + "reward_std": 0.1798313409090042, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669146299362183, + "rewards/correct_reward_func/mean": 0.4270167648792267, + "rewards/correct_reward_func/std": 0.15607501566410065, + "step": 15 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1945.0, + "completions/max_terminated_length": 1945.0, + "completions/mean_length": 1430.6785888671875, + "completions/mean_terminated_length": 1430.6785888671875, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.024922118380062305, + "grad_norm": 0.5683785676956177, + "kl": 0.0018854692461900413, + "learning_rate": 1.996875e-06, + "loss": 0.0171, + "num_tokens": 2153959.0, + "reward": 1.3746126890182495, + "reward_std": 0.11688078194856644, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3746126592159271, + "rewards/correct_reward_func/std": 0.16250161826610565, + "step": 16 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2076.0, + "completions/max_terminated_length": 2076.0, + "completions/mean_length": 1465.607177734375, + "completions/mean_terminated_length": 1465.607177734375, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.0264797507788162, + "grad_norm": 0.5914926528930664, + "kl": 0.002100524492561817, + "learning_rate": 1.99625e-06, + "loss": 0.0093, + "num_tokens": 2283034.0, + "reward": 1.3505299091339111, + "reward_std": 0.10699693858623505, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.35052984952926636, + "rewards/correct_reward_func/std": 0.13587050139904022, + "step": 17 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2083.0, + "completions/max_terminated_length": 2083.0, + "completions/mean_length": 1521.297607421875, + "completions/mean_terminated_length": 1521.297607421875, + "completions/min_length": 1084.0, + "completions/min_terminated_length": 1084.0, + "epoch": 0.028037383177570093, + "grad_norm": 0.5633271336555481, + "kl": 0.002299150452017784, + "learning_rate": 1.995625e-06, + "loss": 0.0163, + "num_tokens": 2416793.0, + "reward": 1.3485947847366333, + "reward_std": 0.12012340128421783, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3485947251319885, + "rewards/correct_reward_func/std": 0.15519553422927856, + "step": 18 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2358.0, + "completions/max_terminated_length": 2358.0, + "completions/mean_length": 1465.702392578125, + "completions/mean_terminated_length": 1465.702392578125, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "epoch": 0.029595015576323987, + "grad_norm": 0.5992788076400757, + "kl": 0.002645128988660872, + "learning_rate": 1.995e-06, + "loss": 0.0093, + "num_tokens": 2545768.0, + "reward": 1.4050683975219727, + "reward_std": 0.09077386558055878, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4050683081150055, + "rewards/correct_reward_func/std": 0.1320529729127884, + "step": 19 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2399.0, + "completions/max_terminated_length": 2399.0, + "completions/mean_length": 1532.8929443359375, + "completions/mean_terminated_length": 1532.8929443359375, + "completions/min_length": 966.0, + "completions/min_terminated_length": 966.0, + "epoch": 0.03115264797507788, + "grad_norm": 0.5546616315841675, + "kl": 0.00307619187515229, + "learning_rate": 1.994375e-06, + "loss": 0.0061, + "num_tokens": 2680495.0, + "reward": 1.4120489358901978, + "reward_std": 0.0814485251903534, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41204896569252014, + "rewards/correct_reward_func/std": 0.14482632279396057, + "step": 20 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2494.0, + "completions/mean_length": 1707.9285888671875, + "completions/mean_terminated_length": 1549.7803955078125, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "epoch": 0.03271028037383177, + "grad_norm": 0.588342010974884, + "kl": 0.0031734263757243752, + "learning_rate": 1.9937499999999998e-06, + "loss": 0.0697, + "num_tokens": 2830129.0, + "reward": 1.336440920829773, + "reward_std": 0.10719747841358185, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3364408016204834, + "rewards/correct_reward_func/std": 0.1317695528268814, + "step": 21 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2244.0, + "completions/max_terminated_length": 2244.0, + "completions/mean_length": 1420.0238037109375, + "completions/mean_terminated_length": 1420.0238037109375, + "completions/min_length": 829.0, + "completions/min_terminated_length": 829.0, + "epoch": 0.03426791277258567, + "grad_norm": 0.5817105770111084, + "kl": 0.003916586167179048, + "learning_rate": 1.993125e-06, + "loss": -0.0456, + "num_tokens": 2955267.0, + "reward": 1.4114601612091064, + "reward_std": 0.1481117159128189, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.43526971340179443, + "rewards/correct_reward_func/std": 0.13317571580410004, + "step": 22 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2074.0, + "completions/max_terminated_length": 2074.0, + "completions/mean_length": 1391.2738037109375, + "completions/mean_terminated_length": 1391.2738037109375, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "epoch": 0.03582554517133956, + "grad_norm": 0.6308074593544006, + "kl": 0.004312207689508796, + "learning_rate": 1.9925e-06, + "loss": -0.0213, + "num_tokens": 3078014.0, + "reward": 1.37629234790802, + "reward_std": 0.16423028707504272, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4001017212867737, + "rewards/correct_reward_func/std": 0.18709857761859894, + "step": 23 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2132.0, + "completions/max_terminated_length": 2132.0, + "completions/mean_length": 1513.857177734375, + "completions/mean_terminated_length": 1513.857177734375, + "completions/min_length": 849.0, + "completions/min_terminated_length": 849.0, + "epoch": 0.037383177570093455, + "grad_norm": 0.5661871433258057, + "kl": 0.004699907032772899, + "learning_rate": 1.991875e-06, + "loss": -0.0204, + "num_tokens": 3211100.0, + "reward": 1.384656310081482, + "reward_std": 0.06906478852033615, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.38465628027915955, + "rewards/correct_reward_func/std": 0.13333608210086823, + "step": 24 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2097.0, + "completions/max_terminated_length": 2097.0, + "completions/mean_length": 1553.4761962890625, + "completions/mean_terminated_length": 1553.4761962890625, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "epoch": 0.03894080996884735, + "grad_norm": 0.5596578121185303, + "kl": 0.0051078200340271, + "learning_rate": 1.9912499999999998e-06, + "loss": -0.0025, + "num_tokens": 3347538.0, + "reward": 1.4165700674057007, + "reward_std": 0.0908581092953682, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4165700376033783, + "rewards/correct_reward_func/std": 0.11516361683607101, + "step": 25 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2070.0, + "completions/mean_length": 1607.5357666015625, + "completions/mean_terminated_length": 1528.2047119140625, + "completions/min_length": 533.0, + "completions/min_terminated_length": 533.0, + "epoch": 0.040498442367601244, + "grad_norm": 0.5141485929489136, + "kl": 0.005483957007527351, + "learning_rate": 1.990625e-06, + "loss": 0.0479, + "num_tokens": 3488835.0, + "reward": 1.3931559324264526, + "reward_std": 0.09568320959806442, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3931559920310974, + "rewards/correct_reward_func/std": 0.15160411596298218, + "step": 26 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2595.0, + "completions/max_terminated_length": 2595.0, + "completions/mean_length": 1509.1429443359375, + "completions/mean_terminated_length": 1509.1429443359375, + "completions/min_length": 955.0, + "completions/min_terminated_length": 955.0, + "epoch": 0.04205607476635514, + "grad_norm": 0.5826243758201599, + "kl": 0.006127089960500598, + "learning_rate": 1.99e-06, + "loss": 0.0232, + "num_tokens": 3621663.0, + "reward": 1.3912872076034546, + "reward_std": 0.09357985109090805, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3912872076034546, + "rewards/correct_reward_func/std": 0.12481305748224258, + "step": 27 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2237.0, + "completions/max_terminated_length": 2237.0, + "completions/mean_length": 1489.3214111328125, + "completions/mean_terminated_length": 1489.3214111328125, + "completions/min_length": 939.0, + "completions/min_terminated_length": 939.0, + "epoch": 0.04361370716510903, + "grad_norm": 0.5792966485023499, + "kl": 0.006385253742337227, + "learning_rate": 1.989375e-06, + "loss": -0.0165, + "num_tokens": 3752922.0, + "reward": 1.353344440460205, + "reward_std": 0.1214955672621727, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.3771539032459259, + "rewards/correct_reward_func/std": 0.1341404765844345, + "step": 28 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2873.0, + "completions/max_terminated_length": 2873.0, + "completions/mean_length": 1522.3095703125, + "completions/mean_terminated_length": 1522.3095703125, + "completions/min_length": 895.0, + "completions/min_terminated_length": 895.0, + "epoch": 0.045171339563862926, + "grad_norm": 0.566626250743866, + "kl": 0.0068822442553937435, + "learning_rate": 1.98875e-06, + "loss": 0.0097, + "num_tokens": 3886802.0, + "reward": 1.4724082946777344, + "reward_std": 0.11441156268119812, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4843129515647888, + "rewards/correct_reward_func/std": 0.1651531606912613, + "step": 29 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2806.0, + "completions/max_terminated_length": 2806.0, + "completions/mean_length": 1525.607177734375, + "completions/mean_terminated_length": 1525.607177734375, + "completions/min_length": 770.0, + "completions/min_terminated_length": 770.0, + "epoch": 0.04672897196261682, + "grad_norm": 0.5457088351249695, + "kl": 0.0072290110401809216, + "learning_rate": 1.9881249999999997e-06, + "loss": -0.0188, + "num_tokens": 4021133.0, + "reward": 1.4517107009887695, + "reward_std": 0.07869784533977509, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4517105519771576, + "rewards/correct_reward_func/std": 0.1555166095495224, + "step": 30 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2269.0, + "completions/max_terminated_length": 2269.0, + "completions/mean_length": 1493.107177734375, + "completions/mean_terminated_length": 1493.107177734375, + "completions/min_length": 962.0, + "completions/min_terminated_length": 962.0, + "epoch": 0.048286604361370715, + "grad_norm": 0.6338136792182922, + "kl": 0.00765396817587316, + "learning_rate": 1.9875e-06, + "loss": -0.0171, + "num_tokens": 4152536.0, + "reward": 1.4097148180007935, + "reward_std": 0.06910388171672821, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.409714937210083, + "rewards/correct_reward_func/std": 0.15830738842487335, + "step": 31 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2282.0, + "completions/max_terminated_length": 2282.0, + "completions/mean_length": 1465.21435546875, + "completions/mean_terminated_length": 1465.21435546875, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "epoch": 0.04984423676012461, + "grad_norm": 0.5526667833328247, + "kl": 0.008096857462078333, + "learning_rate": 1.986875e-06, + "loss": -0.0181, + "num_tokens": 4281500.0, + "reward": 1.4042376279830933, + "reward_std": 0.13532030582427979, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4280470907688141, + "rewards/correct_reward_func/std": 0.1270482838153839, + "step": 32 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2617.0, + "completions/max_terminated_length": 2617.0, + "completions/mean_length": 1565.357177734375, + "completions/mean_terminated_length": 1565.357177734375, + "completions/min_length": 1090.0, + "completions/min_terminated_length": 1090.0, + "epoch": 0.0514018691588785, + "grad_norm": 0.5518661737442017, + "kl": 0.008292545564472675, + "learning_rate": 1.98625e-06, + "loss": -0.0018, + "num_tokens": 4419188.0, + "reward": 1.4700257778167725, + "reward_std": 0.07613833993673325, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4700256884098053, + "rewards/correct_reward_func/std": 0.1587969958782196, + "step": 33 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2546.0, + "completions/max_terminated_length": 2546.0, + "completions/mean_length": 1508.3333740234375, + "completions/mean_terminated_length": 1508.3333740234375, + "completions/min_length": 985.0, + "completions/min_terminated_length": 985.0, + "epoch": 0.0529595015576324, + "grad_norm": 0.5799688696861267, + "kl": 0.008962879423052073, + "learning_rate": 1.9856249999999997e-06, + "loss": 0.0352, + "num_tokens": 4551942.0, + "reward": 1.3886348009109497, + "reward_std": 0.0952007845044136, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4005395472049713, + "rewards/correct_reward_func/std": 0.13069912791252136, + "step": 34 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2638.0, + "completions/mean_length": 1659.6429443359375, + "completions/mean_terminated_length": 1580.939697265625, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.05451713395638629, + "grad_norm": 0.5250554084777832, + "kl": 0.008917136583477259, + "learning_rate": 1.985e-06, + "loss": 0.0202, + "num_tokens": 4697496.0, + "reward": 1.4027281999588013, + "reward_std": 0.1092085987329483, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.41463297605514526, + "rewards/correct_reward_func/std": 0.15449127554893494, + "step": 35 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2422.0, + "completions/max_terminated_length": 2422.0, + "completions/mean_length": 1497.416748046875, + "completions/mean_terminated_length": 1497.416748046875, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "epoch": 0.056074766355140186, + "grad_norm": 0.5780891180038452, + "kl": 0.009273746516555548, + "learning_rate": 1.984375e-06, + "loss": -0.0039, + "num_tokens": 4829153.0, + "reward": 1.447800636291504, + "reward_std": 0.09562971442937851, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44780054688453674, + "rewards/correct_reward_func/std": 0.14727683365345, + "step": 36 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2437.0, + "completions/mean_length": 1602.2261962890625, + "completions/mean_terminated_length": 1522.831298828125, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.05763239875389408, + "grad_norm": 0.544447660446167, + "kl": 0.009292236994951963, + "learning_rate": 1.98375e-06, + "loss": 0.0476, + "num_tokens": 4969716.0, + "reward": 1.3862570524215698, + "reward_std": 0.13318565487861633, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.39816194772720337, + "rewards/correct_reward_func/std": 0.13715338706970215, + "step": 37 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2603.0, + "completions/max_terminated_length": 2603.0, + "completions/mean_length": 1596.1429443359375, + "completions/mean_terminated_length": 1596.1429443359375, + "completions/min_length": 1108.0, + "completions/min_terminated_length": 1108.0, + "epoch": 0.059190031152647975, + "grad_norm": 0.5399186015129089, + "kl": 0.009696871042251587, + "learning_rate": 1.9831249999999998e-06, + "loss": 0.0039, + "num_tokens": 5109858.0, + "reward": 1.450761318206787, + "reward_std": 0.07588593661785126, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4507613480091095, + "rewards/correct_reward_func/std": 0.14506648480892181, + "step": 38 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2544.0, + "completions/max_terminated_length": 2544.0, + "completions/mean_length": 1563.5833740234375, + "completions/mean_terminated_length": 1563.5833740234375, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "epoch": 0.06074766355140187, + "grad_norm": 0.5695660710334778, + "kl": 0.01048774877563119, + "learning_rate": 1.9824999999999997e-06, + "loss": -0.0184, + "num_tokens": 5247163.0, + "reward": 1.468957781791687, + "reward_std": 0.11620029807090759, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48086267709732056, + "rewards/correct_reward_func/std": 0.15909142792224884, + "step": 39 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2598.0, + "completions/max_terminated_length": 2598.0, + "completions/mean_length": 1592.40478515625, + "completions/mean_terminated_length": 1592.40478515625, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "epoch": 0.06230529595015576, + "grad_norm": 0.5112860798835754, + "kl": 0.01074655307456851, + "learning_rate": 1.981875e-06, + "loss": 0.0015, + "num_tokens": 5387021.0, + "reward": 1.4456558227539062, + "reward_std": 0.11054416000843048, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.45756059885025024, + "rewards/correct_reward_func/std": 0.1503782570362091, + "step": 40 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2603.0, + "completions/mean_length": 1634.6190185546875, + "completions/mean_terminated_length": 1555.6143798828125, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "epoch": 0.06386292834890965, + "grad_norm": 0.5349351167678833, + "kl": 0.010826343204826117, + "learning_rate": 1.98125e-06, + "loss": 0.0725, + "num_tokens": 5530521.0, + "reward": 1.4252383708953857, + "reward_std": 0.11318045854568481, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43714308738708496, + "rewards/correct_reward_func/std": 0.1361854374408722, + "step": 41 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2310.0, + "completions/max_terminated_length": 2310.0, + "completions/mean_length": 1527.047607421875, + "completions/mean_terminated_length": 1527.047607421875, + "completions/min_length": 43.0, + "completions/min_terminated_length": 43.0, + "epoch": 0.06542056074766354, + "grad_norm": 0.582492470741272, + "kl": 0.010992726311087608, + "learning_rate": 1.980625e-06, + "loss": -0.0331, + "num_tokens": 5664793.0, + "reward": 1.4851031303405762, + "reward_std": 0.1116347685456276, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4851030707359314, + "rewards/correct_reward_func/std": 0.18202589452266693, + "step": 42 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2985.0, + "completions/max_terminated_length": 2985.0, + "completions/mean_length": 1511.59521484375, + "completions/mean_terminated_length": 1511.59521484375, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.06697819314641744, + "grad_norm": 0.5876683592796326, + "kl": 0.011393898166716099, + "learning_rate": 1.98e-06, + "loss": 0.0116, + "num_tokens": 5797743.0, + "reward": 1.3936251401901245, + "reward_std": 0.08762513846158981, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4055299162864685, + "rewards/correct_reward_func/std": 0.15156783163547516, + "step": 43 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2907.0, + "completions/max_terminated_length": 2907.0, + "completions/mean_length": 1573.21435546875, + "completions/mean_terminated_length": 1573.21435546875, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "epoch": 0.06853582554517133, + "grad_norm": 0.586552619934082, + "kl": 0.012315568514168262, + "learning_rate": 1.979375e-06, + "loss": 0.0067, + "num_tokens": 5935767.0, + "reward": 1.3731769323349, + "reward_std": 0.09234315901994705, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.3850816488265991, + "rewards/correct_reward_func/std": 0.1175212487578392, + "step": 44 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2610.0, + "completions/max_terminated_length": 2610.0, + "completions/mean_length": 1540.416748046875, + "completions/mean_terminated_length": 1540.416748046875, + "completions/min_length": 1049.0, + "completions/min_terminated_length": 1049.0, + "epoch": 0.07009345794392523, + "grad_norm": 0.5170222520828247, + "kl": 0.012232929933816195, + "learning_rate": 1.97875e-06, + "loss": -0.0046, + "num_tokens": 6071102.0, + "reward": 1.4366557598114014, + "reward_std": 0.05740538239479065, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4366556704044342, + "rewards/correct_reward_func/std": 0.12168268114328384, + "step": 45 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2226.0, + "completions/max_terminated_length": 2226.0, + "completions/mean_length": 1525.0, + "completions/mean_terminated_length": 1525.0, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "epoch": 0.07165109034267912, + "grad_norm": 0.5328289270401001, + "kl": 0.01250599604099989, + "learning_rate": 1.978125e-06, + "loss": 0.001, + "num_tokens": 6205328.0, + "reward": 1.4486056566238403, + "reward_std": 0.08073987811803818, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4605104327201843, + "rewards/correct_reward_func/std": 0.1921333372592926, + "step": 46 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2231.0, + "completions/max_terminated_length": 2231.0, + "completions/mean_length": 1511.3095703125, + "completions/mean_terminated_length": 1511.3095703125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.07320872274143302, + "grad_norm": 0.5911400318145752, + "kl": 0.012701177038252354, + "learning_rate": 1.9775e-06, + "loss": -0.0073, + "num_tokens": 6338398.0, + "reward": 1.4118802547454834, + "reward_std": 0.09744904190301895, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.42378488183021545, + "rewards/correct_reward_func/std": 0.15915460884571075, + "step": 47 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2418.0, + "completions/max_terminated_length": 2418.0, + "completions/mean_length": 1445.1785888671875, + "completions/mean_terminated_length": 1445.1785888671875, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "epoch": 0.07476635514018691, + "grad_norm": 0.5825939774513245, + "kl": 0.014202028047293425, + "learning_rate": 1.976875e-06, + "loss": 0.0291, + "num_tokens": 6465733.0, + "reward": 1.4184983968734741, + "reward_std": 0.07337880879640579, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41849830746650696, + "rewards/correct_reward_func/std": 0.12052969634532928, + "step": 48 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2239.0, + "completions/max_terminated_length": 2239.0, + "completions/mean_length": 1505.0595703125, + "completions/mean_terminated_length": 1505.0595703125, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "epoch": 0.0763239875389408, + "grad_norm": 0.5948581099510193, + "kl": 0.013798453379422426, + "learning_rate": 1.97625e-06, + "loss": -0.0075, + "num_tokens": 6598212.0, + "reward": 1.4806807041168213, + "reward_std": 0.07690379023551941, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48068052530288696, + "rewards/correct_reward_func/std": 0.208627387881279, + "step": 49 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2220.0, + "completions/mean_length": 1519.84521484375, + "completions/mean_terminated_length": 1439.457763671875, + "completions/min_length": 750.0, + "completions/min_terminated_length": 750.0, + "epoch": 0.0778816199376947, + "grad_norm": 0.577416181564331, + "kl": 0.013814115896821022, + "learning_rate": 1.975625e-06, + "loss": 0.0698, + "num_tokens": 6731633.0, + "reward": 1.3718026876449585, + "reward_std": 0.08667115122079849, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.37180256843566895, + "rewards/correct_reward_func/std": 0.1464298814535141, + "step": 50 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2822.0, + "completions/max_terminated_length": 2822.0, + "completions/mean_length": 1481.9881591796875, + "completions/mean_terminated_length": 1481.9881591796875, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "epoch": 0.0794392523364486, + "grad_norm": 0.5731110572814941, + "kl": 0.014519122894853354, + "learning_rate": 1.975e-06, + "loss": 0.0268, + "num_tokens": 6861940.0, + "reward": 1.4318668842315674, + "reward_std": 0.10813824832439423, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.443771630525589, + "rewards/correct_reward_func/std": 0.155172199010849, + "step": 51 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2002.0, + "completions/max_terminated_length": 2002.0, + "completions/mean_length": 1394.15478515625, + "completions/mean_terminated_length": 1394.15478515625, + "completions/min_length": 670.0, + "completions/min_terminated_length": 670.0, + "epoch": 0.08099688473520249, + "grad_norm": 0.6160910129547119, + "kl": 0.01515409117564559, + "learning_rate": 1.974375e-06, + "loss": -0.0362, + "num_tokens": 6984959.0, + "reward": 1.4249346256256104, + "reward_std": 0.06116212159395218, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.424934446811676, + "rewards/correct_reward_func/std": 0.15084582567214966, + "step": 52 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2089.0, + "completions/max_terminated_length": 2089.0, + "completions/mean_length": 1428.0238037109375, + "completions/mean_terminated_length": 1428.0238037109375, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "epoch": 0.08255451713395638, + "grad_norm": 0.6062555909156799, + "kl": 0.015089603140950203, + "learning_rate": 1.97375e-06, + "loss": 0.0005, + "num_tokens": 7110697.0, + "reward": 1.427535891532898, + "reward_std": 0.11901802569627762, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4394405484199524, + "rewards/correct_reward_func/std": 0.17434334754943848, + "step": 53 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2193.0, + "completions/mean_length": 1549.46435546875, + "completions/mean_terminated_length": 1469.4337158203125, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "epoch": 0.08411214953271028, + "grad_norm": 0.5520058274269104, + "kl": 0.014234152156859636, + "learning_rate": 1.973125e-06, + "loss": 0.0763, + "num_tokens": 7246990.0, + "reward": 1.5137581825256348, + "reward_std": 0.0792492926120758, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5137581825256348, + "rewards/correct_reward_func/std": 0.1610475480556488, + "step": 54 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1853.0, + "completions/max_terminated_length": 1853.0, + "completions/mean_length": 1293.4405517578125, + "completions/mean_terminated_length": 1293.4405517578125, + "completions/min_length": 20.0, + "completions/min_terminated_length": 20.0, + "epoch": 0.08566978193146417, + "grad_norm": 0.605660617351532, + "kl": 0.01542581431567669, + "learning_rate": 1.9724999999999997e-06, + "loss": -0.038, + "num_tokens": 7361321.0, + "reward": 1.4857640266418457, + "reward_std": 0.10166757553815842, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4857640564441681, + "rewards/correct_reward_func/std": 0.16004827618598938, + "step": 55 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2166.0, + "completions/max_terminated_length": 2166.0, + "completions/mean_length": 1419.96435546875, + "completions/mean_terminated_length": 1419.96435546875, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.08722741433021806, + "grad_norm": 0.5911623239517212, + "kl": 0.015617348719388247, + "learning_rate": 1.971875e-06, + "loss": 0.0036, + "num_tokens": 7486292.0, + "reward": 1.376123309135437, + "reward_std": 0.09990442544221878, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.38802799582481384, + "rewards/correct_reward_func/std": 0.12811584770679474, + "step": 56 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2756.0, + "completions/max_terminated_length": 2756.0, + "completions/mean_length": 1440.84521484375, + "completions/mean_terminated_length": 1440.84521484375, + "completions/min_length": 683.0, + "completions/min_terminated_length": 683.0, + "epoch": 0.08878504672897196, + "grad_norm": 0.5922889113426208, + "kl": 0.016300208866596222, + "learning_rate": 1.97125e-06, + "loss": 0.0113, + "num_tokens": 7613335.0, + "reward": 1.4130823612213135, + "reward_std": 0.1000562384724617, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.42498698830604553, + "rewards/correct_reward_func/std": 0.11995380371809006, + "step": 57 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1405.3214111328125, + "completions/mean_terminated_length": 1405.3214111328125, + "completions/min_length": 637.0, + "completions/min_terminated_length": 637.0, + "epoch": 0.09034267912772585, + "grad_norm": 0.5695552825927734, + "kl": 0.015375382732599974, + "learning_rate": 1.970625e-06, + "loss": 0.0023, + "num_tokens": 7737406.0, + "reward": 1.4474685192108154, + "reward_std": 0.12695710361003876, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4712778627872467, + "rewards/correct_reward_func/std": 0.16331063210964203, + "step": 58 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6591.0, + "completions/max_terminated_length": 6591.0, + "completions/mean_length": 1408.65478515625, + "completions/mean_terminated_length": 1408.65478515625, + "completions/min_length": 21.0, + "completions/min_terminated_length": 21.0, + "epoch": 0.09190031152647975, + "grad_norm": 0.5550790429115295, + "kl": 0.015237292740494013, + "learning_rate": 1.9699999999999998e-06, + "loss": -0.0213, + "num_tokens": 7861643.0, + "reward": 1.4678882360458374, + "reward_std": 0.09509699046611786, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46788811683654785, + "rewards/correct_reward_func/std": 0.1579650342464447, + "step": 59 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2238.0, + "completions/max_terminated_length": 2238.0, + "completions/mean_length": 1376.84521484375, + "completions/mean_terminated_length": 1376.84521484375, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "epoch": 0.09345794392523364, + "grad_norm": 0.5696773529052734, + "kl": 0.01649821363389492, + "learning_rate": 1.969375e-06, + "loss": -0.0025, + "num_tokens": 7983274.0, + "reward": 1.4070838689804077, + "reward_std": 0.061898693442344666, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4070839285850525, + "rewards/correct_reward_func/std": 0.1115923598408699, + "step": 60 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2341.0, + "completions/max_terminated_length": 2341.0, + "completions/mean_length": 1368.8214111328125, + "completions/mean_terminated_length": 1368.8214111328125, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "epoch": 0.09501557632398754, + "grad_norm": 0.6040320992469788, + "kl": 0.017192344181239605, + "learning_rate": 1.96875e-06, + "loss": 0.0044, + "num_tokens": 8104279.0, + "reward": 1.503865361213684, + "reward_std": 0.10960246622562408, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5157700777053833, + "rewards/correct_reward_func/std": 0.17495499551296234, + "step": 61 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1887.0, + "completions/max_terminated_length": 1887.0, + "completions/mean_length": 1308.5, + "completions/mean_terminated_length": 1308.5, + "completions/min_length": 658.0, + "completions/min_terminated_length": 658.0, + "epoch": 0.09657320872274143, + "grad_norm": 0.6219011545181274, + "kl": 0.017216363921761513, + "learning_rate": 1.968125e-06, + "loss": 0.0128, + "num_tokens": 8219905.0, + "reward": 1.4492701292037964, + "reward_std": 0.0713193342089653, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4492699205875397, + "rewards/correct_reward_func/std": 0.179514080286026, + "step": 62 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1914.0, + "completions/max_terminated_length": 1914.0, + "completions/mean_length": 1346.261962890625, + "completions/mean_terminated_length": 1346.261962890625, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "epoch": 0.09813084112149532, + "grad_norm": 0.5801587104797363, + "kl": 0.01690333615988493, + "learning_rate": 1.9675e-06, + "loss": 0.0077, + "num_tokens": 8338913.0, + "reward": 1.456557273864746, + "reward_std": 0.11657059192657471, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46846190094947815, + "rewards/correct_reward_func/std": 0.1239844486117363, + "step": 63 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2429.0, + "completions/max_terminated_length": 2429.0, + "completions/mean_length": 1346.7381591796875, + "completions/mean_terminated_length": 1346.7381591796875, + "completions/min_length": 531.0, + "completions/min_terminated_length": 531.0, + "epoch": 0.09968847352024922, + "grad_norm": 0.6113296747207642, + "kl": 0.017923656851053238, + "learning_rate": 1.9668749999999997e-06, + "loss": -0.0187, + "num_tokens": 8458009.0, + "reward": 1.448889136314392, + "reward_std": 0.07096786797046661, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4488890469074249, + "rewards/correct_reward_func/std": 0.15353722870349884, + "step": 64 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2639.0, + "completions/max_terminated_length": 2639.0, + "completions/mean_length": 1402.4761962890625, + "completions/mean_terminated_length": 1402.4761962890625, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "epoch": 0.10124610591900311, + "grad_norm": 0.5536694526672363, + "kl": 0.017508030869066715, + "learning_rate": 1.96625e-06, + "loss": -0.0024, + "num_tokens": 8581955.0, + "reward": 1.4375591278076172, + "reward_std": 0.12363146990537643, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4613686203956604, + "rewards/correct_reward_func/std": 0.16242319345474243, + "step": 65 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2007.0, + "completions/max_terminated_length": 2007.0, + "completions/mean_length": 1408.5357666015625, + "completions/mean_terminated_length": 1408.5357666015625, + "completions/min_length": 745.0, + "completions/min_terminated_length": 745.0, + "epoch": 0.102803738317757, + "grad_norm": 0.5778936743736267, + "kl": 0.018688876181840897, + "learning_rate": 1.965625e-06, + "loss": 0.0091, + "num_tokens": 8706158.0, + "reward": 1.4179571866989136, + "reward_std": 0.08643031865358353, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41795703768730164, + "rewards/correct_reward_func/std": 0.14004966616630554, + "step": 66 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2824.0, + "completions/max_terminated_length": 2824.0, + "completions/mean_length": 1323.7857666015625, + "completions/mean_terminated_length": 1323.7857666015625, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "epoch": 0.1043613707165109, + "grad_norm": 0.6043643355369568, + "kl": 0.01887867320328951, + "learning_rate": 1.965e-06, + "loss": -0.0489, + "num_tokens": 8823284.0, + "reward": 1.4494088888168335, + "reward_std": 0.1078440248966217, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4613136053085327, + "rewards/correct_reward_func/std": 0.16942912340164185, + "step": 67 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1987.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 1402.0714111328125, + "completions/mean_terminated_length": 1402.0714111328125, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "epoch": 0.1059190031152648, + "grad_norm": 0.5758241415023804, + "kl": 0.01884887833148241, + "learning_rate": 1.9643749999999997e-06, + "loss": 0.0126, + "num_tokens": 8947046.0, + "reward": 1.4188536405563354, + "reward_std": 0.12322477996349335, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4426631033420563, + "rewards/correct_reward_func/std": 0.137950137257576, + "step": 68 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1983.0, + "completions/max_terminated_length": 1983.0, + "completions/mean_length": 1305.75, + "completions/mean_terminated_length": 1305.75, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "epoch": 0.10747663551401869, + "grad_norm": 0.634208083152771, + "kl": 0.01928142551332712, + "learning_rate": 1.96375e-06, + "loss": 0.0068, + "num_tokens": 9062675.0, + "reward": 1.4428762197494507, + "reward_std": 0.0850701555609703, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44287601113319397, + "rewards/correct_reward_func/std": 0.13382165133953094, + "step": 69 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1879.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 1331.6190185546875, + "completions/mean_terminated_length": 1331.6190185546875, + "completions/min_length": 442.0, + "completions/min_terminated_length": 442.0, + "epoch": 0.10903426791277258, + "grad_norm": 0.5981489419937134, + "kl": 0.019094611518085003, + "learning_rate": 1.963125e-06, + "loss": 0.0136, + "num_tokens": 9180489.0, + "reward": 1.4353286027908325, + "reward_std": 0.1428486853837967, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669146299362183, + "rewards/correct_reward_func/mean": 0.47104281187057495, + "rewards/correct_reward_func/std": 0.14202405512332916, + "step": 70 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1905.0, + "completions/mean_length": 1393.84521484375, + "completions/mean_terminated_length": 1311.939697265625, + "completions/min_length": 384.0, + "completions/min_terminated_length": 384.0, + "epoch": 0.11059190031152648, + "grad_norm": 0.6035619378089905, + "kl": 0.018923446536064148, + "learning_rate": 1.9625e-06, + "loss": 0.0341, + "num_tokens": 9303716.0, + "reward": 1.4664435386657715, + "reward_std": 0.09690098464488983, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4783483147621155, + "rewards/correct_reward_func/std": 0.16768568754196167, + "step": 71 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2748.0, + "completions/max_terminated_length": 2748.0, + "completions/mean_length": 1333.5833740234375, + "completions/mean_terminated_length": 1333.5833740234375, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "epoch": 0.11214953271028037, + "grad_norm": 0.6100907325744629, + "kl": 0.020233074203133583, + "learning_rate": 1.9618749999999997e-06, + "loss": 0.0087, + "num_tokens": 9421647.0, + "reward": 1.5223532915115356, + "reward_std": 0.09191029518842697, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5342578887939453, + "rewards/correct_reward_func/std": 0.14939941465854645, + "step": 72 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1924.0, + "completions/max_terminated_length": 1924.0, + "completions/mean_length": 1286.8333740234375, + "completions/mean_terminated_length": 1286.8333740234375, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "epoch": 0.11370716510903427, + "grad_norm": 0.599524736404419, + "kl": 0.020186283625662327, + "learning_rate": 1.9612499999999996e-06, + "loss": -0.0046, + "num_tokens": 9535795.0, + "reward": 1.5099772214889526, + "reward_std": 0.08711431175470352, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5099770426750183, + "rewards/correct_reward_func/std": 0.15553654730319977, + "step": 73 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2063.0, + "completions/max_terminated_length": 2063.0, + "completions/mean_length": 1436.34521484375, + "completions/mean_terminated_length": 1436.34521484375, + "completions/min_length": 1035.0, + "completions/min_terminated_length": 1035.0, + "epoch": 0.11526479750778816, + "grad_norm": 0.5551663041114807, + "kl": 0.019929789006710052, + "learning_rate": 1.960625e-06, + "loss": 0.0163, + "num_tokens": 9662448.0, + "reward": 1.4788012504577637, + "reward_std": 0.06518861651420593, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4788011312484741, + "rewards/correct_reward_func/std": 0.1376221776008606, + "step": 74 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1884.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 1261.25, + "completions/mean_terminated_length": 1261.25, + "completions/min_length": 748.0, + "completions/min_terminated_length": 748.0, + "epoch": 0.11682242990654206, + "grad_norm": 0.6235907673835754, + "kl": 0.020636904053390026, + "learning_rate": 1.96e-06, + "loss": 0.0244, + "num_tokens": 9774249.0, + "reward": 1.4924941062927246, + "reward_std": 0.11271940171718597, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.492494136095047, + "rewards/correct_reward_func/std": 0.1804288774728775, + "step": 75 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2336.0, + "completions/max_terminated_length": 2336.0, + "completions/mean_length": 1304.4405517578125, + "completions/mean_terminated_length": 1304.4405517578125, + "completions/min_length": 777.0, + "completions/min_terminated_length": 777.0, + "epoch": 0.11838006230529595, + "grad_norm": 0.5913470983505249, + "kl": 0.021659635938704014, + "learning_rate": 1.959375e-06, + "loss": -0.0261, + "num_tokens": 9889840.0, + "reward": 1.5048737525939941, + "reward_std": 0.07548126578330994, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5048737525939941, + "rewards/correct_reward_func/std": 0.15308529138565063, + "step": 76 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2124.0, + "completions/max_terminated_length": 2124.0, + "completions/mean_length": 1349.297607421875, + "completions/mean_terminated_length": 1349.297607421875, + "completions/min_length": 344.0, + "completions/min_terminated_length": 344.0, + "epoch": 0.11993769470404984, + "grad_norm": 0.6202085018157959, + "kl": 0.0223425030708313, + "learning_rate": 1.95875e-06, + "loss": -0.0086, + "num_tokens": 10009103.0, + "reward": 1.4445719718933105, + "reward_std": 0.07185468822717667, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44457200169563293, + "rewards/correct_reward_func/std": 0.134343683719635, + "step": 77 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2239.0, + "completions/max_terminated_length": 2239.0, + "completions/mean_length": 1348.916748046875, + "completions/mean_terminated_length": 1348.916748046875, + "completions/min_length": 868.0, + "completions/min_terminated_length": 868.0, + "epoch": 0.12149532710280374, + "grad_norm": 0.5950252413749695, + "kl": 0.022254208102822304, + "learning_rate": 1.958125e-06, + "loss": -0.0042, + "num_tokens": 10128292.0, + "reward": 1.423844575881958, + "reward_std": 0.1012512668967247, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43574920296669006, + "rewards/correct_reward_func/std": 0.14782190322875977, + "step": 78 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2228.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 1300.7738037109375, + "completions/mean_terminated_length": 1300.7738037109375, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.12305295950155763, + "grad_norm": 0.6564156413078308, + "kl": 0.022449446842074394, + "learning_rate": 1.9575e-06, + "loss": -0.0301, + "num_tokens": 10243383.0, + "reward": 1.4560483694076538, + "reward_std": 0.09143143892288208, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45604828000068665, + "rewards/correct_reward_func/std": 0.18612980842590332, + "step": 79 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1842.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 1269.9761962890625, + "completions/mean_terminated_length": 1269.9761962890625, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "epoch": 0.12461059190031153, + "grad_norm": 0.5963668823242188, + "kl": 0.02302556298673153, + "learning_rate": 1.956875e-06, + "loss": 0.0111, + "num_tokens": 10355959.0, + "reward": 1.4893500804901123, + "reward_std": 0.060889869928359985, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48934999108314514, + "rewards/correct_reward_func/std": 0.17626559734344482, + "step": 80 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2109.0, + "completions/max_terminated_length": 2109.0, + "completions/mean_length": 1332.357177734375, + "completions/mean_terminated_length": 1332.357177734375, + "completions/min_length": 700.0, + "completions/min_terminated_length": 700.0, + "epoch": 0.1261682242990654, + "grad_norm": 0.579901397228241, + "kl": 0.023500431329011917, + "learning_rate": 1.95625e-06, + "loss": -0.0165, + "num_tokens": 10473763.0, + "reward": 1.4350974559783936, + "reward_std": 0.09430722892284393, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.435097336769104, + "rewards/correct_reward_func/std": 0.17246632277965546, + "step": 81 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2049.0, + "completions/max_terminated_length": 2049.0, + "completions/mean_length": 1330.65478515625, + "completions/mean_terminated_length": 1330.65478515625, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "epoch": 0.1277258566978193, + "grad_norm": 0.6202122569084167, + "kl": 0.02360576204955578, + "learning_rate": 1.955625e-06, + "loss": -0.0197, + "num_tokens": 10591580.0, + "reward": 1.4157038927078247, + "reward_std": 0.10765408724546432, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4276086390018463, + "rewards/correct_reward_func/std": 0.12850892543792725, + "step": 82 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1902.0, + "completions/mean_length": 1427.7738037109375, + "completions/mean_terminated_length": 1346.277099609375, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "epoch": 0.1292834890965732, + "grad_norm": 0.6136038899421692, + "kl": 0.023545796051621437, + "learning_rate": 1.955e-06, + "loss": 0.0744, + "num_tokens": 10717513.0, + "reward": 1.4062758684158325, + "reward_std": 0.1044679582118988, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4181804656982422, + "rewards/correct_reward_func/std": 0.12793242931365967, + "step": 83 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1828.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 1328.46435546875, + "completions/mean_terminated_length": 1328.46435546875, + "completions/min_length": 757.0, + "completions/min_terminated_length": 757.0, + "epoch": 0.1308411214953271, + "grad_norm": 0.6154831051826477, + "kl": 0.024212509393692017, + "learning_rate": 1.954375e-06, + "loss": -0.0025, + "num_tokens": 10835026.0, + "reward": 1.4841961860656738, + "reward_std": 0.08680614829063416, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48419615626335144, + "rewards/correct_reward_func/std": 0.18122969567775726, + "step": 84 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2318.0, + "completions/max_terminated_length": 2318.0, + "completions/mean_length": 1370.1905517578125, + "completions/mean_terminated_length": 1370.1905517578125, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "epoch": 0.13239875389408098, + "grad_norm": 0.5834784507751465, + "kl": 0.02425501774996519, + "learning_rate": 1.95375e-06, + "loss": 0.0084, + "num_tokens": 10956050.0, + "reward": 1.4767500162124634, + "reward_std": 0.0894772931933403, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4767499566078186, + "rewards/correct_reward_func/std": 0.17486557364463806, + "step": 85 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2188.0, + "completions/max_terminated_length": 2188.0, + "completions/mean_length": 1371.547607421875, + "completions/mean_terminated_length": 1371.547607421875, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "epoch": 0.13395638629283488, + "grad_norm": 0.5802989602088928, + "kl": 0.024927244521677494, + "learning_rate": 1.953125e-06, + "loss": 0.015, + "num_tokens": 11077188.0, + "reward": 1.4462058544158936, + "reward_std": 0.07399098575115204, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4462057650089264, + "rewards/correct_reward_func/std": 0.12212073057889938, + "step": 86 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3011.0, + "completions/max_terminated_length": 3011.0, + "completions/mean_length": 1386.9405517578125, + "completions/mean_terminated_length": 1386.9405517578125, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "epoch": 0.13551401869158877, + "grad_norm": 0.5793676376342773, + "kl": 0.024698903784155846, + "learning_rate": 1.9525e-06, + "loss": -0.0095, + "num_tokens": 11199721.0, + "reward": 1.4632221460342407, + "reward_std": 0.11924762278795242, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.47512686252593994, + "rewards/correct_reward_func/std": 0.20697803795337677, + "step": 87 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1388.047607421875, + "completions/mean_terminated_length": 1388.047607421875, + "completions/min_length": 970.0, + "completions/min_terminated_length": 970.0, + "epoch": 0.13707165109034267, + "grad_norm": 0.5643482804298401, + "kl": 0.02437182515859604, + "learning_rate": 1.951875e-06, + "loss": -0.0021, + "num_tokens": 11322299.0, + "reward": 1.4515498876571655, + "reward_std": 0.08298921585083008, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45154979825019836, + "rewards/correct_reward_func/std": 0.13497759401798248, + "step": 88 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2292.0, + "completions/mean_length": 1484.107177734375, + "completions/mean_terminated_length": 1403.2890625, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "epoch": 0.13862928348909656, + "grad_norm": 0.5531883239746094, + "kl": 0.023836837150156498, + "learning_rate": 1.9512499999999997e-06, + "loss": 0.0771, + "num_tokens": 11453042.0, + "reward": 1.4721571207046509, + "reward_std": 0.07365047186613083, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4721570611000061, + "rewards/correct_reward_func/std": 0.17172464728355408, + "step": 89 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1920.0, + "completions/max_terminated_length": 1920.0, + "completions/mean_length": 1399.9881591796875, + "completions/mean_terminated_length": 1399.9881591796875, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "epoch": 0.14018691588785046, + "grad_norm": 0.5798895359039307, + "kl": 0.024346785619854927, + "learning_rate": 1.950625e-06, + "loss": 0.0128, + "num_tokens": 11576779.0, + "reward": 1.4762535095214844, + "reward_std": 0.07557668536901474, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4762535095214844, + "rewards/correct_reward_func/std": 0.11668115109205246, + "step": 90 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2492.0, + "completions/max_terminated_length": 2492.0, + "completions/mean_length": 1411.7261962890625, + "completions/mean_terminated_length": 1411.7261962890625, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "epoch": 0.14174454828660435, + "grad_norm": 0.5864601731300354, + "kl": 0.025230017490684986, + "learning_rate": 1.95e-06, + "loss": -0.0058, + "num_tokens": 11701376.0, + "reward": 1.43711256980896, + "reward_std": 0.06144386902451515, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4371124505996704, + "rewards/correct_reward_func/std": 0.11439383029937744, + "step": 91 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3613.0, + "completions/max_terminated_length": 3613.0, + "completions/mean_length": 1334.6190185546875, + "completions/mean_terminated_length": 1334.6190185546875, + "completions/min_length": 904.0, + "completions/min_terminated_length": 904.0, + "epoch": 0.14330218068535824, + "grad_norm": 0.607031524181366, + "kl": 0.026643778197467327, + "learning_rate": 1.949375e-06, + "loss": 0.0048, + "num_tokens": 11819346.0, + "reward": 1.4904909133911133, + "reward_std": 0.0712086409330368, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4904908537864685, + "rewards/correct_reward_func/std": 0.11422253400087357, + "step": 92 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2193.0, + "completions/max_terminated_length": 2193.0, + "completions/mean_length": 1322.666748046875, + "completions/mean_terminated_length": 1322.666748046875, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "epoch": 0.14485981308411214, + "grad_norm": 0.6401565074920654, + "kl": 0.024812299758195877, + "learning_rate": 1.9487499999999998e-06, + "loss": -0.0045, + "num_tokens": 11936330.0, + "reward": 1.3906474113464355, + "reward_std": 0.07590549439191818, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4025520086288452, + "rewards/correct_reward_func/std": 0.16384169459342957, + "step": 93 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3187.0, + "completions/max_terminated_length": 3187.0, + "completions/mean_length": 1393.202392578125, + "completions/mean_terminated_length": 1393.202392578125, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "epoch": 0.14641744548286603, + "grad_norm": 0.6122695803642273, + "kl": 0.025386733002960682, + "learning_rate": 1.948125e-06, + "loss": 0.0086, + "num_tokens": 12059407.0, + "reward": 1.562843680381775, + "reward_std": 0.11002606898546219, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5747482776641846, + "rewards/correct_reward_func/std": 0.1576448678970337, + "step": 94 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2248.0, + "completions/max_terminated_length": 2248.0, + "completions/mean_length": 1397.547607421875, + "completions/mean_terminated_length": 1397.547607421875, + "completions/min_length": 299.0, + "completions/min_terminated_length": 299.0, + "epoch": 0.14797507788161993, + "grad_norm": 0.620469868183136, + "kl": 0.02531202882528305, + "learning_rate": 1.9475e-06, + "loss": -0.0123, + "num_tokens": 12182867.0, + "reward": 1.3950475454330444, + "reward_std": 0.11134982109069824, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4069521725177765, + "rewards/correct_reward_func/std": 0.14087031781673431, + "step": 95 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2121.0, + "completions/max_terminated_length": 2121.0, + "completions/mean_length": 1391.297607421875, + "completions/mean_terminated_length": 1391.297607421875, + "completions/min_length": 782.0, + "completions/min_terminated_length": 782.0, + "epoch": 0.14953271028037382, + "grad_norm": 0.5656567215919495, + "kl": 0.025920305401086807, + "learning_rate": 1.946875e-06, + "loss": 0.0145, + "num_tokens": 12305718.0, + "reward": 1.447014570236206, + "reward_std": 0.08030132949352264, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44701454043388367, + "rewards/correct_reward_func/std": 0.14291325211524963, + "step": 96 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1768.0, + "completions/max_terminated_length": 1768.0, + "completions/mean_length": 1303.34521484375, + "completions/mean_terminated_length": 1303.34521484375, + "completions/min_length": 725.0, + "completions/min_terminated_length": 725.0, + "epoch": 0.15109034267912771, + "grad_norm": 0.6328920125961304, + "kl": 0.026815838180482388, + "learning_rate": 1.94625e-06, + "loss": 0.0115, + "num_tokens": 12421073.0, + "reward": 1.4320292472839355, + "reward_std": 0.07025571167469025, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4320293068885803, + "rewards/correct_reward_func/std": 0.14837507903575897, + "step": 97 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2359.0, + "completions/max_terminated_length": 2359.0, + "completions/mean_length": 1374.34521484375, + "completions/mean_terminated_length": 1374.34521484375, + "completions/min_length": 532.0, + "completions/min_terminated_length": 532.0, + "epoch": 0.1526479750778816, + "grad_norm": 0.564152181148529, + "kl": 0.025968145579099655, + "learning_rate": 1.9456249999999997e-06, + "loss": -0.0224, + "num_tokens": 12542284.0, + "reward": 1.3898595571517944, + "reward_std": 0.11127079278230667, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.41366904973983765, + "rewards/correct_reward_func/std": 0.18172919750213623, + "step": 98 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2000.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1444.761962890625, + "completions/mean_terminated_length": 1444.761962890625, + "completions/min_length": 925.0, + "completions/min_terminated_length": 925.0, + "epoch": 0.1542056074766355, + "grad_norm": 0.5919306874275208, + "kl": 0.02604432962834835, + "learning_rate": 1.945e-06, + "loss": 0.028, + "num_tokens": 12669842.0, + "reward": 1.4752211570739746, + "reward_std": 0.07153313606977463, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47522109746932983, + "rewards/correct_reward_func/std": 0.1358029991388321, + "step": 99 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2159.0, + "completions/max_terminated_length": 2159.0, + "completions/mean_length": 1375.2857666015625, + "completions/mean_terminated_length": 1375.2857666015625, + "completions/min_length": 656.0, + "completions/min_terminated_length": 656.0, + "epoch": 0.1557632398753894, + "grad_norm": 0.6110028624534607, + "kl": 0.02663259394466877, + "learning_rate": 1.944375e-06, + "loss": 0.0359, + "num_tokens": 12791516.0, + "reward": 1.4065624475479126, + "reward_std": 0.07423868775367737, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4065624475479126, + "rewards/correct_reward_func/std": 0.1365046501159668, + "step": 100 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2102.0, + "completions/max_terminated_length": 2102.0, + "completions/mean_length": 1338.75, + "completions/mean_terminated_length": 1338.75, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "epoch": 0.1573208722741433, + "grad_norm": 0.6527758240699768, + "kl": 0.026112915948033333, + "learning_rate": 1.94375e-06, + "loss": 0.0353, + "num_tokens": 12909851.0, + "reward": 1.4908164739608765, + "reward_std": 0.11687764525413513, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5027210712432861, + "rewards/correct_reward_func/std": 0.15153582394123077, + "step": 101 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2034.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 1339.357177734375, + "completions/mean_terminated_length": 1339.357177734375, + "completions/min_length": 820.0, + "completions/min_terminated_length": 820.0, + "epoch": 0.1588785046728972, + "grad_norm": 0.6052369475364685, + "kl": 0.026593846268951893, + "learning_rate": 1.9431249999999997e-06, + "loss": -0.0369, + "num_tokens": 13028333.0, + "reward": 1.443188190460205, + "reward_std": 0.06765951961278915, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4431880712509155, + "rewards/correct_reward_func/std": 0.12718868255615234, + "step": 102 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2281.0, + "completions/mean_length": 1414.84521484375, + "completions/mean_terminated_length": 1333.1927490234375, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.16043613707165108, + "grad_norm": 0.5841978788375854, + "kl": 0.027741556987166405, + "learning_rate": 1.9424999999999996e-06, + "loss": 0.0426, + "num_tokens": 13153132.0, + "reward": 1.458450198173523, + "reward_std": 0.10668490082025528, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4584501385688782, + "rewards/correct_reward_func/std": 0.17332585155963898, + "step": 103 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2362.0, + "completions/max_terminated_length": 2362.0, + "completions/mean_length": 1408.5238037109375, + "completions/mean_terminated_length": 1408.5238037109375, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.16199376947040497, + "grad_norm": 0.574649453163147, + "kl": 0.026361594907939434, + "learning_rate": 1.941875e-06, + "loss": 0.0063, + "num_tokens": 13277502.0, + "reward": 1.4935458898544312, + "reward_std": 0.06738085299730301, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.493545800447464, + "rewards/correct_reward_func/std": 0.17171715199947357, + "step": 104 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2340.0, + "completions/max_terminated_length": 2340.0, + "completions/mean_length": 1397.2261962890625, + "completions/mean_terminated_length": 1397.2261962890625, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "epoch": 0.16355140186915887, + "grad_norm": 0.6107763648033142, + "kl": 0.028758167289197445, + "learning_rate": 1.94125e-06, + "loss": -0.0053, + "num_tokens": 13401001.0, + "reward": 1.4977682828903198, + "reward_std": 0.07337197661399841, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49776825308799744, + "rewards/correct_reward_func/std": 0.16519995033740997, + "step": 105 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1723.0, + "completions/max_terminated_length": 1723.0, + "completions/mean_length": 1302.9405517578125, + "completions/mean_terminated_length": 1302.9405517578125, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "epoch": 0.16510903426791276, + "grad_norm": 0.5974235534667969, + "kl": 0.02774975076317787, + "learning_rate": 1.940625e-06, + "loss": 0.0079, + "num_tokens": 13516292.0, + "reward": 1.500797152519226, + "reward_std": 0.07562069594860077, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5007970333099365, + "rewards/correct_reward_func/std": 0.1385519951581955, + "step": 106 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1333.3214111328125, + "completions/mean_terminated_length": 1333.3214111328125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "epoch": 0.16666666666666666, + "grad_norm": 0.5899143218994141, + "kl": 0.02739392127841711, + "learning_rate": 1.94e-06, + "loss": -0.0042, + "num_tokens": 13634147.0, + "reward": 1.3661153316497803, + "reward_std": 0.1458723396062851, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669146299362183, + "rewards/correct_reward_func/mean": 0.4018295407295227, + "rewards/correct_reward_func/std": 0.14165017008781433, + "step": 107 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2314.0, + "completions/max_terminated_length": 2314.0, + "completions/mean_length": 1340.2261962890625, + "completions/mean_terminated_length": 1340.2261962890625, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "epoch": 0.16822429906542055, + "grad_norm": 0.6039218902587891, + "kl": 0.027482734993100166, + "learning_rate": 1.939375e-06, + "loss": 0.0001, + "num_tokens": 13752750.0, + "reward": 1.4691303968429565, + "reward_std": 0.07967161387205124, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.469130277633667, + "rewards/correct_reward_func/std": 0.17357668280601501, + "step": 108 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2010.0, + "completions/max_terminated_length": 2010.0, + "completions/mean_length": 1375.047607421875, + "completions/mean_terminated_length": 1375.047607421875, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "epoch": 0.16978193146417445, + "grad_norm": 0.575854480266571, + "kl": 0.029047698713839054, + "learning_rate": 1.93875e-06, + "loss": -0.0041, + "num_tokens": 13874344.0, + "reward": 1.4227412939071655, + "reward_std": 0.07607690989971161, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42274120450019836, + "rewards/correct_reward_func/std": 0.12703333795070648, + "step": 109 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2687.0, + "completions/mean_length": 1464.4285888671875, + "completions/mean_terminated_length": 1383.3734130859375, + "completions/min_length": 958.0, + "completions/min_terminated_length": 958.0, + "epoch": 0.17133956386292834, + "grad_norm": 0.5789852738380432, + "kl": 0.02678022440522909, + "learning_rate": 1.938125e-06, + "loss": 0.0679, + "num_tokens": 14003362.0, + "reward": 1.4550813436508179, + "reward_std": 0.09208068251609802, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4550813138484955, + "rewards/correct_reward_func/std": 0.12678542733192444, + "step": 110 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1957.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1335.797607421875, + "completions/mean_terminated_length": 1335.797607421875, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "epoch": 0.17289719626168223, + "grad_norm": 0.5807675719261169, + "kl": 0.027965486980974674, + "learning_rate": 1.9375e-06, + "loss": 0.0215, + "num_tokens": 14121389.0, + "reward": 1.4275166988372803, + "reward_std": 0.09834519028663635, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43942132592201233, + "rewards/correct_reward_func/std": 0.17403005063533783, + "step": 111 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2137.0, + "completions/max_terminated_length": 2137.0, + "completions/mean_length": 1329.2738037109375, + "completions/mean_terminated_length": 1329.2738037109375, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "epoch": 0.17445482866043613, + "grad_norm": 0.6370654702186584, + "kl": 0.03011655993759632, + "learning_rate": 1.936875e-06, + "loss": -0.0064, + "num_tokens": 14239240.0, + "reward": 1.4735430479049683, + "reward_std": 0.07655790448188782, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4735429883003235, + "rewards/correct_reward_func/std": 0.1377311646938324, + "step": 112 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1904.0, + "completions/max_terminated_length": 1904.0, + "completions/mean_length": 1350.702392578125, + "completions/mean_terminated_length": 1350.702392578125, + "completions/min_length": 766.0, + "completions/min_terminated_length": 766.0, + "epoch": 0.17601246105919002, + "grad_norm": 0.5633478164672852, + "kl": 0.027842647396028042, + "learning_rate": 1.93625e-06, + "loss": 0.0033, + "num_tokens": 14358717.0, + "reward": 1.4617105722427368, + "reward_std": 0.07503892481327057, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46171048283576965, + "rewards/correct_reward_func/std": 0.11019705981016159, + "step": 113 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1946.0, + "completions/max_terminated_length": 1946.0, + "completions/mean_length": 1320.6785888671875, + "completions/mean_terminated_length": 1320.6785888671875, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "epoch": 0.17757009345794392, + "grad_norm": 0.622870683670044, + "kl": 0.02942818123847246, + "learning_rate": 1.9356249999999998e-06, + "loss": -0.0117, + "num_tokens": 14475876.0, + "reward": 1.4551728963851929, + "reward_std": 0.07278000563383102, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4551727771759033, + "rewards/correct_reward_func/std": 0.12961725890636444, + "step": 114 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2867.0, + "completions/max_terminated_length": 2867.0, + "completions/mean_length": 1367.547607421875, + "completions/mean_terminated_length": 1367.547607421875, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "epoch": 0.1791277258566978, + "grad_norm": 0.5723932385444641, + "kl": 0.027969708666205406, + "learning_rate": 1.935e-06, + "loss": 0.0019, + "num_tokens": 14596690.0, + "reward": 1.424649715423584, + "reward_std": 0.10459105670452118, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4365543723106384, + "rewards/correct_reward_func/std": 0.13125936686992645, + "step": 115 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2112.0, + "completions/mean_length": 1496.8690185546875, + "completions/mean_terminated_length": 1416.2047119140625, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "epoch": 0.1806853582554517, + "grad_norm": 0.5445118546485901, + "kl": 0.027951962314546108, + "learning_rate": 1.934375e-06, + "loss": 0.0382, + "num_tokens": 14728457.0, + "reward": 1.4374322891235352, + "reward_std": 0.08895470947027206, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4374321401119232, + "rewards/correct_reward_func/std": 0.1454283893108368, + "step": 116 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2093.0, + "completions/max_terminated_length": 2093.0, + "completions/mean_length": 1406.547607421875, + "completions/mean_terminated_length": 1406.547607421875, + "completions/min_length": 698.0, + "completions/min_terminated_length": 698.0, + "epoch": 0.1822429906542056, + "grad_norm": 0.5892264246940613, + "kl": 0.02943518850952387, + "learning_rate": 1.93375e-06, + "loss": 0.0251, + "num_tokens": 14852511.0, + "reward": 1.4491535425186157, + "reward_std": 0.07768179476261139, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44915345311164856, + "rewards/correct_reward_func/std": 0.14331160485744476, + "step": 117 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2191.0, + "completions/max_terminated_length": 2191.0, + "completions/mean_length": 1454.1429443359375, + "completions/mean_terminated_length": 1454.1429443359375, + "completions/min_length": 991.0, + "completions/min_terminated_length": 991.0, + "epoch": 0.1838006230529595, + "grad_norm": 0.5593940615653992, + "kl": 0.028669409453868866, + "learning_rate": 1.933125e-06, + "loss": -0.0092, + "num_tokens": 14980683.0, + "reward": 1.4328858852386475, + "reward_std": 0.06446022540330887, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4328858554363251, + "rewards/correct_reward_func/std": 0.15252062678337097, + "step": 118 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1976.0, + "completions/max_terminated_length": 1976.0, + "completions/mean_length": 1330.416748046875, + "completions/mean_terminated_length": 1330.416748046875, + "completions/min_length": 564.0, + "completions/min_terminated_length": 564.0, + "epoch": 0.1853582554517134, + "grad_norm": 0.6204725503921509, + "kl": 0.031036019325256348, + "learning_rate": 1.9325e-06, + "loss": 0.0538, + "num_tokens": 15098204.0, + "reward": 1.4865161180496216, + "reward_std": 0.06775263696908951, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4865160584449768, + "rewards/correct_reward_func/std": 0.1286322921514511, + "step": 119 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2474.0, + "completions/max_terminated_length": 2474.0, + "completions/mean_length": 1421.0714111328125, + "completions/mean_terminated_length": 1421.0714111328125, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "epoch": 0.18691588785046728, + "grad_norm": 0.6097022294998169, + "kl": 0.028833536431193352, + "learning_rate": 1.931875e-06, + "loss": -0.0063, + "num_tokens": 15223664.0, + "reward": 1.4651082754135132, + "reward_std": 0.08360718935728073, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4770130515098572, + "rewards/correct_reward_func/std": 0.15723736584186554, + "step": 120 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2433.0, + "completions/max_terminated_length": 2433.0, + "completions/mean_length": 1469.916748046875, + "completions/mean_terminated_length": 1469.916748046875, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.18847352024922118, + "grad_norm": 0.5351011157035828, + "kl": 0.028757021762430668, + "learning_rate": 1.93125e-06, + "loss": 0.0368, + "num_tokens": 15353281.0, + "reward": 1.4508754014968872, + "reward_std": 0.06538330763578415, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4508753716945648, + "rewards/correct_reward_func/std": 0.14440658688545227, + "step": 121 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2518.0, + "completions/max_terminated_length": 2518.0, + "completions/mean_length": 1432.6190185546875, + "completions/mean_terminated_length": 1432.6190185546875, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "epoch": 0.19003115264797507, + "grad_norm": 0.5838773846626282, + "kl": 0.030190047807991505, + "learning_rate": 1.930625e-06, + "loss": 0.0004, + "num_tokens": 15479627.0, + "reward": 1.5679670572280884, + "reward_std": 0.08373278379440308, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5679671168327332, + "rewards/correct_reward_func/std": 0.17479771375656128, + "step": 122 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2098.0, + "completions/max_terminated_length": 2098.0, + "completions/mean_length": 1383.1905517578125, + "completions/mean_terminated_length": 1383.1905517578125, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "epoch": 0.19158878504672897, + "grad_norm": 0.603692889213562, + "kl": 0.03224192373454571, + "learning_rate": 1.9299999999999997e-06, + "loss": 0.0088, + "num_tokens": 15601791.0, + "reward": 1.4391270875930786, + "reward_std": 0.06994114071130753, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4391269087791443, + "rewards/correct_reward_func/std": 0.14909610152244568, + "step": 123 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2679.0, + "completions/max_terminated_length": 2679.0, + "completions/mean_length": 1490.2381591796875, + "completions/mean_terminated_length": 1490.2381591796875, + "completions/min_length": 1037.0, + "completions/min_terminated_length": 1037.0, + "epoch": 0.19314641744548286, + "grad_norm": 0.5941579937934875, + "kl": 0.02839325089007616, + "learning_rate": 1.929375e-06, + "loss": 0.0225, + "num_tokens": 15733049.0, + "reward": 1.415550947189331, + "reward_std": 0.06161380559206009, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41555076837539673, + "rewards/correct_reward_func/std": 0.10922452807426453, + "step": 124 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2151.0, + "completions/max_terminated_length": 2151.0, + "completions/mean_length": 1322.2857666015625, + "completions/mean_terminated_length": 1322.2857666015625, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "epoch": 0.19470404984423675, + "grad_norm": 0.5985201001167297, + "kl": 0.029462194070219994, + "learning_rate": 1.92875e-06, + "loss": -0.0175, + "num_tokens": 15849953.0, + "reward": 1.4787646532058716, + "reward_std": 0.09507114440202713, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47876468300819397, + "rewards/correct_reward_func/std": 0.1848842203617096, + "step": 125 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1882.0, + "completions/max_terminated_length": 1882.0, + "completions/mean_length": 1334.8333740234375, + "completions/mean_terminated_length": 1334.8333740234375, + "completions/min_length": 500.0, + "completions/min_terminated_length": 500.0, + "epoch": 0.19626168224299065, + "grad_norm": 0.6323754191398621, + "kl": 0.02924549486488104, + "learning_rate": 1.928125e-06, + "loss": -0.0014, + "num_tokens": 15967953.0, + "reward": 1.520105242729187, + "reward_std": 0.07988641411066055, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5201051831245422, + "rewards/correct_reward_func/std": 0.16170603036880493, + "step": 126 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2265.0, + "completions/max_terminated_length": 2265.0, + "completions/mean_length": 1414.9881591796875, + "completions/mean_terminated_length": 1414.9881591796875, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "epoch": 0.19781931464174454, + "grad_norm": 0.5964637994766235, + "kl": 0.029463034123182297, + "learning_rate": 1.9274999999999998e-06, + "loss": 0.0118, + "num_tokens": 16092890.0, + "reward": 1.4794553518295288, + "reward_std": 0.06303998827934265, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47945523262023926, + "rewards/correct_reward_func/std": 0.11690139025449753, + "step": 127 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2203.0, + "completions/mean_length": 1490.8214111328125, + "completions/mean_terminated_length": 1410.084228515625, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "epoch": 0.19937694704049844, + "grad_norm": 0.5852746963500977, + "kl": 0.029941502027213573, + "learning_rate": 1.9268749999999997e-06, + "loss": 0.0568, + "num_tokens": 16224011.0, + "reward": 1.4570320844650269, + "reward_std": 0.13305586576461792, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4808415472507477, + "rewards/correct_reward_func/std": 0.15189340710639954, + "step": 128 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2393.0, + "completions/max_terminated_length": 2393.0, + "completions/mean_length": 1419.9285888671875, + "completions/mean_terminated_length": 1419.9285888671875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "epoch": 0.20093457943925233, + "grad_norm": 0.6678198575973511, + "kl": 0.02869417704641819, + "learning_rate": 1.92625e-06, + "loss": -0.0245, + "num_tokens": 16349273.0, + "reward": 1.4819056987762451, + "reward_std": 0.0900546982884407, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4819056987762451, + "rewards/correct_reward_func/std": 0.13743773102760315, + "step": 129 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2579.0, + "completions/max_terminated_length": 2579.0, + "completions/mean_length": 1410.511962890625, + "completions/mean_terminated_length": 1410.511962890625, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "epoch": 0.20249221183800623, + "grad_norm": 0.5829043984413147, + "kl": 0.029658552259206772, + "learning_rate": 1.925625e-06, + "loss": 0.0304, + "num_tokens": 16473606.0, + "reward": 1.4438934326171875, + "reward_std": 0.08044224977493286, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4438934326171875, + "rewards/correct_reward_func/std": 0.15281730890274048, + "step": 130 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2306.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 1419.6190185546875, + "completions/mean_terminated_length": 1419.6190185546875, + "completions/min_length": 876.0, + "completions/min_terminated_length": 876.0, + "epoch": 0.20404984423676012, + "grad_norm": 0.6060424447059631, + "kl": 0.029810849577188492, + "learning_rate": 1.9249999999999998e-06, + "loss": 0.0186, + "num_tokens": 16598698.0, + "reward": 1.4231759309768677, + "reward_std": 0.13214969635009766, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669144809246063, + "rewards/correct_reward_func/mean": 0.4588901996612549, + "rewards/correct_reward_func/std": 0.17215143144130707, + "step": 131 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2134.0, + "completions/mean_length": 1587.047607421875, + "completions/mean_terminated_length": 1425.951171875, + "completions/min_length": 791.0, + "completions/min_terminated_length": 791.0, + "epoch": 0.205607476635514, + "grad_norm": 0.5848340392112732, + "kl": 0.02779593039304018, + "learning_rate": 1.9243749999999997e-06, + "loss": 0.1231, + "num_tokens": 16738082.0, + "reward": 1.418549656867981, + "reward_std": 0.09781080484390259, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41854962706565857, + "rewards/correct_reward_func/std": 0.15713582932949066, + "step": 132 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2101.0, + "completions/max_terminated_length": 2101.0, + "completions/mean_length": 1363.107177734375, + "completions/mean_terminated_length": 1363.107177734375, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "epoch": 0.2071651090342679, + "grad_norm": 0.6095062494277954, + "kl": 0.02982013951987028, + "learning_rate": 1.92375e-06, + "loss": -0.0032, + "num_tokens": 16858487.0, + "reward": 1.4797476530075073, + "reward_std": 0.08453521132469177, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47974759340286255, + "rewards/correct_reward_func/std": 0.13973869383335114, + "step": 133 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2277.0, + "completions/max_terminated_length": 2277.0, + "completions/mean_length": 1384.3095703125, + "completions/mean_terminated_length": 1384.3095703125, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "epoch": 0.2087227414330218, + "grad_norm": 0.5872117280960083, + "kl": 0.03033105470240116, + "learning_rate": 1.923125e-06, + "loss": -0.0099, + "num_tokens": 16980787.0, + "reward": 1.4834345579147339, + "reward_std": 0.0603872612118721, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48343440890312195, + "rewards/correct_reward_func/std": 0.16034552454948425, + "step": 134 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2117.0, + "completions/max_terminated_length": 2117.0, + "completions/mean_length": 1370.40478515625, + "completions/mean_terminated_length": 1370.40478515625, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "epoch": 0.2102803738317757, + "grad_norm": 0.6103830337524414, + "kl": 0.030320947989821434, + "learning_rate": 1.9225e-06, + "loss": 0.0061, + "num_tokens": 17101811.0, + "reward": 1.4273531436920166, + "reward_std": 0.09265647828578949, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4392578601837158, + "rewards/correct_reward_func/std": 0.13740864396095276, + "step": 135 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1993.0, + "completions/max_terminated_length": 1993.0, + "completions/mean_length": 1434.46435546875, + "completions/mean_terminated_length": 1434.46435546875, + "completions/min_length": 812.0, + "completions/min_terminated_length": 812.0, + "epoch": 0.2118380062305296, + "grad_norm": 0.603561520576477, + "kl": 0.02980469260364771, + "learning_rate": 1.9218749999999997e-06, + "loss": -0.0064, + "num_tokens": 17228618.0, + "reward": 1.4576891660690308, + "reward_std": 0.05491868779063225, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45768895745277405, + "rewards/correct_reward_func/std": 0.14544495940208435, + "step": 136 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2719.0, + "completions/max_terminated_length": 2719.0, + "completions/mean_length": 1493.416748046875, + "completions/mean_terminated_length": 1493.416748046875, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.21339563862928349, + "grad_norm": 0.5915752053260803, + "kl": 0.028933603316545486, + "learning_rate": 1.9212499999999996e-06, + "loss": -0.0056, + "num_tokens": 17360071.0, + "reward": 1.5237936973571777, + "reward_std": 0.07290388643741608, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5237936973571777, + "rewards/correct_reward_func/std": 0.14304442703723907, + "step": 137 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2306.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 1456.1785888671875, + "completions/mean_terminated_length": 1456.1785888671875, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 0.21495327102803738, + "grad_norm": 0.5867950916290283, + "kl": 0.030623883940279484, + "learning_rate": 1.920625e-06, + "loss": 0.0369, + "num_tokens": 17488402.0, + "reward": 1.424317717552185, + "reward_std": 0.09510175883769989, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4362224042415619, + "rewards/correct_reward_func/std": 0.15234197676181793, + "step": 138 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7913.0, + "completions/max_terminated_length": 7913.0, + "completions/mean_length": 1533.6190185546875, + "completions/mean_terminated_length": 1533.6190185546875, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "epoch": 0.21651090342679127, + "grad_norm": 0.5414514541625977, + "kl": 0.02811363060027361, + "learning_rate": 1.92e-06, + "loss": 0.0618, + "num_tokens": 17623412.0, + "reward": 1.491133689880371, + "reward_std": 0.06729433685541153, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49113360047340393, + "rewards/correct_reward_func/std": 0.17636097967624664, + "step": 139 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 1416.011962890625, + "completions/mean_terminated_length": 1416.011962890625, + "completions/min_length": 852.0, + "completions/min_terminated_length": 852.0, + "epoch": 0.21806853582554517, + "grad_norm": 0.5571795701980591, + "kl": 0.03138226270675659, + "learning_rate": 1.919375e-06, + "loss": 0.0457, + "num_tokens": 17748435.0, + "reward": 1.4378496408462524, + "reward_std": 0.07887466251850128, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4378494918346405, + "rewards/correct_reward_func/std": 0.10594429075717926, + "step": 140 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2041.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1325.5833740234375, + "completions/mean_terminated_length": 1325.5833740234375, + "completions/min_length": 374.0, + "completions/min_terminated_length": 374.0, + "epoch": 0.21962616822429906, + "grad_norm": 0.5585833191871643, + "kl": 0.03183058649301529, + "learning_rate": 1.91875e-06, + "loss": -0.0038, + "num_tokens": 17865712.0, + "reward": 1.5206776857376099, + "reward_std": 0.089789979159832, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5206776261329651, + "rewards/correct_reward_func/std": 0.18472737073898315, + "step": 141 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1899.0, + "completions/max_terminated_length": 1899.0, + "completions/mean_length": 1388.392822265625, + "completions/mean_terminated_length": 1388.392822265625, + "completions/min_length": 950.0, + "completions/min_terminated_length": 950.0, + "epoch": 0.22118380062305296, + "grad_norm": 0.5522568225860596, + "kl": 0.030762070789933205, + "learning_rate": 1.918125e-06, + "loss": 0.0044, + "num_tokens": 17988241.0, + "reward": 1.4582120180130005, + "reward_std": 0.13537071645259857, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669144809246063, + "rewards/correct_reward_func/mean": 0.49392637610435486, + "rewards/correct_reward_func/std": 0.1673257201910019, + "step": 142 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2187.0, + "completions/max_terminated_length": 2187.0, + "completions/mean_length": 1382.2857666015625, + "completions/mean_terminated_length": 1382.2857666015625, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "epoch": 0.22274143302180685, + "grad_norm": 0.6352323889732361, + "kl": 0.03129299636930227, + "learning_rate": 1.9175e-06, + "loss": 0.0264, + "num_tokens": 18110221.0, + "reward": 1.4690262079238892, + "reward_std": 0.0818016454577446, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.469026118516922, + "rewards/correct_reward_func/std": 0.152619868516922, + "step": 143 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2218.0, + "completions/max_terminated_length": 2218.0, + "completions/mean_length": 1394.166748046875, + "completions/mean_terminated_length": 1394.166748046875, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.22429906542056074, + "grad_norm": 0.63814777135849, + "kl": 0.03126653470098972, + "learning_rate": 1.916875e-06, + "loss": 0.0072, + "num_tokens": 18233295.0, + "reward": 1.5044426918029785, + "reward_std": 0.10564389079809189, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5163474082946777, + "rewards/correct_reward_func/std": 0.15176692605018616, + "step": 144 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1836.0, + "completions/max_terminated_length": 1836.0, + "completions/mean_length": 1409.261962890625, + "completions/mean_terminated_length": 1409.261962890625, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "epoch": 0.22585669781931464, + "grad_norm": 0.6800863146781921, + "kl": 0.03138407226651907, + "learning_rate": 1.91625e-06, + "loss": 0.0101, + "num_tokens": 18357601.0, + "reward": 1.3754723072052002, + "reward_std": 0.12025143206119537, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.3992818295955658, + "rewards/correct_reward_func/std": 0.1352054476737976, + "step": 145 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1912.0, + "completions/max_terminated_length": 1912.0, + "completions/mean_length": 1371.5833740234375, + "completions/mean_terminated_length": 1371.5833740234375, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "epoch": 0.22741433021806853, + "grad_norm": 0.5346211791038513, + "kl": 0.03179503232240677, + "learning_rate": 1.915625e-06, + "loss": 0.0068, + "num_tokens": 18478958.0, + "reward": 1.4643080234527588, + "reward_std": 0.05763059854507446, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46430787444114685, + "rewards/correct_reward_func/std": 0.13611404597759247, + "step": 146 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2153.0, + "completions/max_terminated_length": 2153.0, + "completions/mean_length": 1347.75, + "completions/mean_terminated_length": 1347.75, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.22897196261682243, + "grad_norm": 0.6036585569381714, + "kl": 0.032050169073045254, + "learning_rate": 1.915e-06, + "loss": 0.0138, + "num_tokens": 18598259.0, + "reward": 1.5019899606704712, + "reward_std": 0.07086333632469177, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5019899010658264, + "rewards/correct_reward_func/std": 0.12254533916711807, + "step": 147 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1863.0, + "completions/max_terminated_length": 1863.0, + "completions/mean_length": 1349.4761962890625, + "completions/mean_terminated_length": 1349.4761962890625, + "completions/min_length": 539.0, + "completions/min_terminated_length": 539.0, + "epoch": 0.23052959501557632, + "grad_norm": 0.5723428130149841, + "kl": 0.033308178186416626, + "learning_rate": 1.9143749999999998e-06, + "loss": -0.0342, + "num_tokens": 18717507.0, + "reward": 1.439958095550537, + "reward_std": 0.09097757190465927, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.45186278223991394, + "rewards/correct_reward_func/std": 0.15231125056743622, + "step": 148 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1705.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 1306.0714111328125, + "completions/mean_terminated_length": 1306.0714111328125, + "completions/min_length": 493.0, + "completions/min_terminated_length": 493.0, + "epoch": 0.23208722741433022, + "grad_norm": 0.6826386451721191, + "kl": 0.040258824825286865, + "learning_rate": 1.91375e-06, + "loss": -0.0055, + "num_tokens": 18833091.0, + "reward": 1.432396411895752, + "reward_std": 0.0788629949092865, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4323963522911072, + "rewards/correct_reward_func/std": 0.1607416421175003, + "step": 149 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1910.0, + "completions/max_terminated_length": 1910.0, + "completions/mean_length": 1339.7857666015625, + "completions/mean_terminated_length": 1339.7857666015625, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "epoch": 0.2336448598130841, + "grad_norm": 0.6238492131233215, + "kl": 0.033289359882473946, + "learning_rate": 1.913125e-06, + "loss": -0.0105, + "num_tokens": 18951591.0, + "reward": 1.4412423372268677, + "reward_std": 0.06025463342666626, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44124215841293335, + "rewards/correct_reward_func/std": 0.1178692877292633, + "step": 150 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1867.0, + "completions/max_terminated_length": 1867.0, + "completions/mean_length": 1272.0357666015625, + "completions/mean_terminated_length": 1272.0357666015625, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "epoch": 0.235202492211838, + "grad_norm": 0.6190460920333862, + "kl": 0.034545375034213066, + "learning_rate": 1.9125e-06, + "loss": -0.0009, + "num_tokens": 19064406.0, + "reward": 1.402614951133728, + "reward_std": 0.054387416690588, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.40261486172676086, + "rewards/correct_reward_func/std": 0.13192394375801086, + "step": 151 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6207.0, + "completions/max_terminated_length": 6207.0, + "completions/mean_length": 1383.2738037109375, + "completions/mean_terminated_length": 1383.2738037109375, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "epoch": 0.2367601246105919, + "grad_norm": 0.6016775369644165, + "kl": 0.03410913795232773, + "learning_rate": 1.911875e-06, + "loss": -0.0575, + "num_tokens": 19186433.0, + "reward": 1.465145468711853, + "reward_std": 0.06573602557182312, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46514537930488586, + "rewards/correct_reward_func/std": 0.13837113976478577, + "step": 152 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1986.0, + "completions/max_terminated_length": 1986.0, + "completions/mean_length": 1315.6309814453125, + "completions/mean_terminated_length": 1315.6309814453125, + "completions/min_length": 607.0, + "completions/min_terminated_length": 607.0, + "epoch": 0.2383177570093458, + "grad_norm": 0.6019495725631714, + "kl": 0.03409886732697487, + "learning_rate": 1.9112499999999997e-06, + "loss": -0.0083, + "num_tokens": 19302892.0, + "reward": 1.4242923259735107, + "reward_std": 0.0667426809668541, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4242922365665436, + "rewards/correct_reward_func/std": 0.13553784787654877, + "step": 153 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2117.0, + "completions/max_terminated_length": 2117.0, + "completions/mean_length": 1342.0833740234375, + "completions/mean_terminated_length": 1342.0833740234375, + "completions/min_length": 238.0, + "completions/min_terminated_length": 238.0, + "epoch": 0.2398753894080997, + "grad_norm": 0.5771262049674988, + "kl": 0.0346537921577692, + "learning_rate": 1.910625e-06, + "loss": -0.0074, + "num_tokens": 19421651.0, + "reward": 1.4762444496154785, + "reward_std": 0.08208738267421722, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4762443006038666, + "rewards/correct_reward_func/std": 0.19542856514453888, + "step": 154 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2065.0, + "completions/max_terminated_length": 2065.0, + "completions/mean_length": 1312.6905517578125, + "completions/mean_terminated_length": 1312.6905517578125, + "completions/min_length": 753.0, + "completions/min_terminated_length": 753.0, + "epoch": 0.24143302180685358, + "grad_norm": 0.5773271918296814, + "kl": 0.03536880388855934, + "learning_rate": 1.91e-06, + "loss": -0.0149, + "num_tokens": 19537893.0, + "reward": 1.4299932718276978, + "reward_std": 0.06326950341463089, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.429993212223053, + "rewards/correct_reward_func/std": 0.15807007253170013, + "step": 155 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2368.0, + "completions/max_terminated_length": 2368.0, + "completions/mean_length": 1352.166748046875, + "completions/mean_terminated_length": 1352.166748046875, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "epoch": 0.24299065420560748, + "grad_norm": 0.6289856433868408, + "kl": 0.03449527733027935, + "learning_rate": 1.909375e-06, + "loss": -0.003, + "num_tokens": 19657355.0, + "reward": 1.4612387418746948, + "reward_std": 0.06969407945871353, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46123871207237244, + "rewards/correct_reward_func/std": 0.1171262189745903, + "step": 156 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1960.0, + "completions/max_terminated_length": 1960.0, + "completions/mean_length": 1367.142822265625, + "completions/mean_terminated_length": 1367.142822265625, + "completions/min_length": 612.0, + "completions/min_terminated_length": 612.0, + "epoch": 0.24454828660436137, + "grad_norm": 0.5991291999816895, + "kl": 0.034693608060479164, + "learning_rate": 1.9087499999999997e-06, + "loss": 0.0271, + "num_tokens": 19778057.0, + "reward": 1.3796496391296387, + "reward_std": 0.056341852992773056, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.3796495795249939, + "rewards/correct_reward_func/std": 0.11140848696231842, + "step": 157 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2143.0, + "completions/max_terminated_length": 2143.0, + "completions/mean_length": 1404.4881591796875, + "completions/mean_terminated_length": 1404.4881591796875, + "completions/min_length": 936.0, + "completions/min_terminated_length": 936.0, + "epoch": 0.24610591900311526, + "grad_norm": 0.5495375990867615, + "kl": 0.03542102687060833, + "learning_rate": 1.908125e-06, + "loss": 0.0074, + "num_tokens": 19902070.0, + "reward": 1.47572660446167, + "reward_std": 0.10572995990514755, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.49953609704971313, + "rewards/correct_reward_func/std": 0.17438319325447083, + "step": 158 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2159.0, + "completions/mean_length": 1494.416748046875, + "completions/mean_terminated_length": 1413.7227783203125, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.24766355140186916, + "grad_norm": 0.5821869373321533, + "kl": 0.03435787186026573, + "learning_rate": 1.9075e-06, + "loss": 0.0632, + "num_tokens": 20033679.0, + "reward": 1.4355047941207886, + "reward_std": 0.0911315381526947, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44740939140319824, + "rewards/correct_reward_func/std": 0.15956489741802216, + "step": 159 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2155.0, + "completions/max_terminated_length": 2155.0, + "completions/mean_length": 1374.7857666015625, + "completions/mean_terminated_length": 1374.7857666015625, + "completions/min_length": 933.0, + "completions/min_terminated_length": 933.0, + "epoch": 0.24922118380062305, + "grad_norm": 0.6089724898338318, + "kl": 0.033023279160261154, + "learning_rate": 1.906875e-06, + "loss": 0.0046, + "num_tokens": 20155203.0, + "reward": 1.4912819862365723, + "reward_std": 0.05513819307088852, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4912818670272827, + "rewards/correct_reward_func/std": 0.16794303059577942, + "step": 160 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2273.0, + "completions/max_terminated_length": 2273.0, + "completions/mean_length": 1431.2381591796875, + "completions/mean_terminated_length": 1431.2381591796875, + "completions/min_length": 807.0, + "completions/min_terminated_length": 807.0, + "epoch": 0.2507788161993769, + "grad_norm": 0.5501843690872192, + "kl": 0.03268396854400635, + "learning_rate": 1.90625e-06, + "loss": 0.0046, + "num_tokens": 20281463.0, + "reward": 1.4660686254501343, + "reward_std": 0.0724034234881401, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4660685062408447, + "rewards/correct_reward_func/std": 0.16601873934268951, + "step": 161 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2477.0, + "completions/mean_length": 1548.9881591796875, + "completions/mean_terminated_length": 1468.9517822265625, + "completions/min_length": 942.0, + "completions/min_terminated_length": 942.0, + "epoch": 0.2523364485981308, + "grad_norm": 0.5610131621360779, + "kl": 0.03170663956552744, + "learning_rate": 1.9056249999999999e-06, + "loss": 0.0445, + "num_tokens": 20417668.0, + "reward": 1.451736569404602, + "reward_std": 0.09086348861455917, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4517364501953125, + "rewards/correct_reward_func/std": 0.17963163554668427, + "step": 162 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2153.0, + "completions/mean_length": 1610.5, + "completions/mean_terminated_length": 1449.9755859375, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "epoch": 0.2538940809968847, + "grad_norm": 0.5172660946846008, + "kl": 0.030543990433216095, + "learning_rate": 1.905e-06, + "loss": 0.1107, + "num_tokens": 20558926.0, + "reward": 1.4814132452011108, + "reward_std": 0.09557101875543594, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4814131557941437, + "rewards/correct_reward_func/std": 0.1708441823720932, + "step": 163 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2249.0, + "completions/max_terminated_length": 2249.0, + "completions/mean_length": 1437.416748046875, + "completions/mean_terminated_length": 1437.416748046875, + "completions/min_length": 633.0, + "completions/min_terminated_length": 633.0, + "epoch": 0.2554517133956386, + "grad_norm": 0.5784984230995178, + "kl": 0.03393215127289295, + "learning_rate": 1.9043749999999999e-06, + "loss": 0.0018, + "num_tokens": 20685663.0, + "reward": 1.5089834928512573, + "reward_std": 0.08463006466627121, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5208882093429565, + "rewards/correct_reward_func/std": 0.1223825141787529, + "step": 164 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3265.0, + "completions/max_terminated_length": 3265.0, + "completions/mean_length": 1466.6190185546875, + "completions/mean_terminated_length": 1466.6190185546875, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.2570093457943925, + "grad_norm": 0.6114115715026855, + "kl": 0.03406968712806702, + "learning_rate": 1.90375e-06, + "loss": 0.0058, + "num_tokens": 20814859.0, + "reward": 1.4296975135803223, + "reward_std": 0.08567629754543304, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4296974539756775, + "rewards/correct_reward_func/std": 0.12944677472114563, + "step": 165 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6028.0, + "completions/max_terminated_length": 6028.0, + "completions/mean_length": 1430.15478515625, + "completions/mean_terminated_length": 1430.15478515625, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "epoch": 0.2585669781931464, + "grad_norm": 0.6096552014350891, + "kl": 0.03403060883283615, + "learning_rate": 1.9031249999999999e-06, + "loss": 0.0408, + "num_tokens": 20940830.0, + "reward": 1.4890365600585938, + "reward_std": 0.05935479328036308, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4890367090702057, + "rewards/correct_reward_func/std": 0.17837880551815033, + "step": 166 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2530.0, + "completions/mean_length": 1496.1785888671875, + "completions/mean_terminated_length": 1415.5059814453125, + "completions/min_length": 706.0, + "completions/min_terminated_length": 706.0, + "epoch": 0.2601246105919003, + "grad_norm": 0.5552809238433838, + "kl": 0.03266907203942537, + "learning_rate": 1.9025e-06, + "loss": 0.0631, + "num_tokens": 21072461.0, + "reward": 1.4831464290618896, + "reward_std": 0.07709907740354538, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48314639925956726, + "rewards/correct_reward_func/std": 0.1887180060148239, + "step": 167 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2289.0, + "completions/max_terminated_length": 2289.0, + "completions/mean_length": 1424.15478515625, + "completions/mean_terminated_length": 1424.15478515625, + "completions/min_length": 1038.0, + "completions/min_terminated_length": 1038.0, + "epoch": 0.2616822429906542, + "grad_norm": 0.5698901414871216, + "kl": 0.03557535447180271, + "learning_rate": 1.901875e-06, + "loss": 0.0045, + "num_tokens": 21198000.0, + "reward": 1.4246412515640259, + "reward_std": 0.07475357502698898, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4246411919593811, + "rewards/correct_reward_func/std": 0.12371546775102615, + "step": 168 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3198.0, + "completions/max_terminated_length": 3198.0, + "completions/mean_length": 1557.857177734375, + "completions/mean_terminated_length": 1557.857177734375, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "epoch": 0.2632398753894081, + "grad_norm": 0.5663595795631409, + "kl": 0.03528137691318989, + "learning_rate": 1.90125e-06, + "loss": 0.0162, + "num_tokens": 21334890.0, + "reward": 1.4614077806472778, + "reward_std": 0.06879691779613495, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46140775084495544, + "rewards/correct_reward_func/std": 0.12173257023096085, + "step": 169 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 1575.3095703125, + "completions/mean_terminated_length": 1495.59033203125, + "completions/min_length": 767.0, + "completions/min_terminated_length": 767.0, + "epoch": 0.26479750778816197, + "grad_norm": 0.5297430753707886, + "kl": 0.03137396089732647, + "learning_rate": 1.900625e-06, + "loss": 0.0552, + "num_tokens": 21473114.0, + "reward": 1.4736964702606201, + "reward_std": 0.08518790453672409, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47369638085365295, + "rewards/correct_reward_func/std": 0.15848514437675476, + "step": 170 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2487.0, + "completions/max_terminated_length": 2487.0, + "completions/mean_length": 1467.511962890625, + "completions/mean_terminated_length": 1467.511962890625, + "completions/min_length": 784.0, + "completions/min_terminated_length": 784.0, + "epoch": 0.26635514018691586, + "grad_norm": 0.5714218020439148, + "kl": 0.03465087711811066, + "learning_rate": 1.8999999999999998e-06, + "loss": -0.0139, + "num_tokens": 21602295.0, + "reward": 1.4387915134429932, + "reward_std": 0.07327855378389359, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.438791424036026, + "rewards/correct_reward_func/std": 0.14280220866203308, + "step": 171 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2169.0, + "completions/max_terminated_length": 2169.0, + "completions/mean_length": 1464.6190185546875, + "completions/mean_terminated_length": 1464.6190185546875, + "completions/min_length": 792.0, + "completions/min_terminated_length": 792.0, + "epoch": 0.26791277258566976, + "grad_norm": 0.5421945452690125, + "kl": 0.034480318427085876, + "learning_rate": 1.899375e-06, + "loss": 0.0296, + "num_tokens": 21731377.0, + "reward": 1.4377334117889404, + "reward_std": 0.0828586295247078, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4496382176876068, + "rewards/correct_reward_func/std": 0.13945025205612183, + "step": 172 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2290.0, + "completions/max_terminated_length": 2290.0, + "completions/mean_length": 1374.34521484375, + "completions/mean_terminated_length": 1374.34521484375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "epoch": 0.26947040498442365, + "grad_norm": 0.6028652191162109, + "kl": 0.03403773531317711, + "learning_rate": 1.8987499999999998e-06, + "loss": -0.0284, + "num_tokens": 21852672.0, + "reward": 1.4843207597732544, + "reward_std": 0.0630205050110817, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4843207001686096, + "rewards/correct_reward_func/std": 0.14204466342926025, + "step": 173 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2999.0, + "completions/max_terminated_length": 2999.0, + "completions/mean_length": 1529.8809814453125, + "completions/mean_terminated_length": 1529.8809814453125, + "completions/min_length": 768.0, + "completions/min_terminated_length": 768.0, + "epoch": 0.27102803738317754, + "grad_norm": 0.5906941890716553, + "kl": 0.034525854513049126, + "learning_rate": 1.898125e-06, + "loss": 0.0167, + "num_tokens": 21987254.0, + "reward": 1.3962242603302002, + "reward_std": 0.0881531834602356, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4081288278102875, + "rewards/correct_reward_func/std": 0.08537304401397705, + "step": 174 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3739.0, + "completions/max_terminated_length": 3739.0, + "completions/mean_length": 1501.166748046875, + "completions/mean_terminated_length": 1501.166748046875, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "epoch": 0.27258566978193144, + "grad_norm": 0.5807496905326843, + "kl": 0.03358875773847103, + "learning_rate": 1.8974999999999998e-06, + "loss": 0.0106, + "num_tokens": 22119274.0, + "reward": 1.4818309545516968, + "reward_std": 0.08105891197919846, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4818309247493744, + "rewards/correct_reward_func/std": 0.1239246129989624, + "step": 175 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2070.0, + "completions/max_terminated_length": 2070.0, + "completions/mean_length": 1430.09521484375, + "completions/mean_terminated_length": 1430.09521484375, + "completions/min_length": 860.0, + "completions/min_terminated_length": 860.0, + "epoch": 0.27414330218068533, + "grad_norm": 0.5961090922355652, + "kl": 0.03483774699270725, + "learning_rate": 1.896875e-06, + "loss": -0.0327, + "num_tokens": 22245432.0, + "reward": 1.4600034952163696, + "reward_std": 0.08812181651592255, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4719081223011017, + "rewards/correct_reward_func/std": 0.1560893952846527, + "step": 176 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2218.0, + "completions/mean_length": 1490.65478515625, + "completions/mean_terminated_length": 1409.9156494140625, + "completions/min_length": 850.0, + "completions/min_terminated_length": 850.0, + "epoch": 0.2757009345794392, + "grad_norm": 0.5829291343688965, + "kl": 0.033502571284770966, + "learning_rate": 1.8962499999999998e-06, + "loss": 0.0753, + "num_tokens": 22376581.0, + "reward": 1.4762822389602661, + "reward_std": 0.09881778061389923, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4762822091579437, + "rewards/correct_reward_func/std": 0.15524689853191376, + "step": 177 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2410.0, + "completions/mean_length": 1661.4405517578125, + "completions/mean_terminated_length": 1502.158447265625, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "epoch": 0.2772585669781931, + "grad_norm": 0.49473848938941956, + "kl": 0.031944600865244865, + "learning_rate": 1.8956249999999997e-06, + "loss": 0.1143, + "num_tokens": 22522250.0, + "reward": 1.4583110809326172, + "reward_std": 0.07724699378013611, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4583110809326172, + "rewards/correct_reward_func/std": 0.15963733196258545, + "step": 178 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2345.0, + "completions/max_terminated_length": 2345.0, + "completions/mean_length": 1503.09521484375, + "completions/mean_terminated_length": 1503.09521484375, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.278816199376947, + "grad_norm": 0.5652268528938293, + "kl": 0.033452507108449936, + "learning_rate": 1.8949999999999999e-06, + "loss": -0.0117, + "num_tokens": 22654690.0, + "reward": 1.539380431175232, + "reward_std": 0.07203835994005203, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5393803119659424, + "rewards/correct_reward_func/std": 0.15730725228786469, + "step": 179 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2153.0, + "completions/max_terminated_length": 2153.0, + "completions/mean_length": 1424.7381591796875, + "completions/mean_terminated_length": 1424.7381591796875, + "completions/min_length": 752.0, + "completions/min_terminated_length": 752.0, + "epoch": 0.2803738317757009, + "grad_norm": 0.6266071796417236, + "kl": 0.03748060762882233, + "learning_rate": 1.8943749999999998e-06, + "loss": 0.0132, + "num_tokens": 22780308.0, + "reward": 1.4268115758895874, + "reward_std": 0.06137494370341301, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4268115758895874, + "rewards/correct_reward_func/std": 0.1579686850309372, + "step": 180 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2288.0, + "completions/mean_length": 1513.3214111328125, + "completions/mean_terminated_length": 1432.8553466796875, + "completions/min_length": 946.0, + "completions/min_terminated_length": 946.0, + "epoch": 0.2819314641744548, + "grad_norm": 0.5787291526794434, + "kl": 0.03337083198130131, + "learning_rate": 1.8937499999999999e-06, + "loss": 0.0331, + "num_tokens": 22913361.0, + "reward": 1.4345508813858032, + "reward_std": 0.08300718665122986, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43455079197883606, + "rewards/correct_reward_func/std": 0.12243600934743881, + "step": 181 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2425.0, + "completions/max_terminated_length": 2425.0, + "completions/mean_length": 1513.0238037109375, + "completions/mean_terminated_length": 1513.0238037109375, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.2834890965732087, + "grad_norm": 0.5958402752876282, + "kl": 0.0339033342897892, + "learning_rate": 1.8931249999999998e-06, + "loss": -0.0335, + "num_tokens": 23046509.0, + "reward": 1.4896963834762573, + "reward_std": 0.074093759059906, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4896962344646454, + "rewards/correct_reward_func/std": 0.18158133327960968, + "step": 182 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2347.0, + "completions/max_terminated_length": 2347.0, + "completions/mean_length": 1505.4405517578125, + "completions/mean_terminated_length": 1505.4405517578125, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "epoch": 0.2850467289719626, + "grad_norm": 0.5892803072929382, + "kl": 0.03511413745582104, + "learning_rate": 1.8924999999999999e-06, + "loss": -0.0281, + "num_tokens": 23179050.0, + "reward": 1.4773499965667725, + "reward_std": 0.06594084203243256, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4773499667644501, + "rewards/correct_reward_func/std": 0.12919507920742035, + "step": 183 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 1522.84521484375, + "completions/mean_terminated_length": 1442.493896484375, + "completions/min_length": 865.0, + "completions/min_terminated_length": 865.0, + "epoch": 0.2866043613707165, + "grad_norm": 0.5434836149215698, + "kl": 0.034950753673911095, + "learning_rate": 1.891875e-06, + "loss": 0.0603, + "num_tokens": 23312909.0, + "reward": 1.4319921731948853, + "reward_std": 0.09058649092912674, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4438968300819397, + "rewards/correct_reward_func/std": 0.1126542016863823, + "step": 184 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2118.0, + "completions/max_terminated_length": 2118.0, + "completions/mean_length": 1422.1190185546875, + "completions/mean_terminated_length": 1422.1190185546875, + "completions/min_length": 793.0, + "completions/min_terminated_length": 793.0, + "epoch": 0.2881619937694704, + "grad_norm": 0.6052145957946777, + "kl": 0.03563849255442619, + "learning_rate": 1.89125e-06, + "loss": -0.0224, + "num_tokens": 23438439.0, + "reward": 1.508691668510437, + "reward_std": 0.06740865856409073, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5086915493011475, + "rewards/correct_reward_func/std": 0.1557309627532959, + "step": 185 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2163.0, + "completions/max_terminated_length": 2163.0, + "completions/mean_length": 1442.6905517578125, + "completions/mean_terminated_length": 1442.6905517578125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.2897196261682243, + "grad_norm": 0.5826404094696045, + "kl": 0.03599457070231438, + "learning_rate": 1.890625e-06, + "loss": 0.022, + "num_tokens": 23565397.0, + "reward": 1.44635009765625, + "reward_std": 0.06368335336446762, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44635000824928284, + "rewards/correct_reward_func/std": 0.14564184844493866, + "step": 186 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2051.0, + "completions/max_terminated_length": 2051.0, + "completions/mean_length": 1438.2381591796875, + "completions/mean_terminated_length": 1438.2381591796875, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "epoch": 0.29127725856697817, + "grad_norm": 0.597940981388092, + "kl": 0.0359827596694231, + "learning_rate": 1.89e-06, + "loss": -0.0061, + "num_tokens": 23691981.0, + "reward": 1.4225126504898071, + "reward_std": 0.07259950041770935, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4225126802921295, + "rewards/correct_reward_func/std": 0.14631718397140503, + "step": 187 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2586.0, + "completions/max_terminated_length": 2586.0, + "completions/mean_length": 1509.1785888671875, + "completions/mean_terminated_length": 1509.1785888671875, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.29283489096573206, + "grad_norm": 0.5778841972351074, + "kl": 0.03521919064223766, + "learning_rate": 1.889375e-06, + "loss": -0.0273, + "num_tokens": 23824830.0, + "reward": 1.4619708061218262, + "reward_std": 0.07388392835855484, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46197065711021423, + "rewards/correct_reward_func/std": 0.13990521430969238, + "step": 188 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1537.8333740234375, + "completions/mean_terminated_length": 1457.66259765625, + "completions/min_length": 970.0, + "completions/min_terminated_length": 970.0, + "epoch": 0.29439252336448596, + "grad_norm": 0.5850031971931458, + "kl": 0.036018045619130135, + "learning_rate": 1.88875e-06, + "loss": 0.078, + "num_tokens": 23959864.0, + "reward": 1.4022135734558105, + "reward_std": 0.06926076114177704, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.402213454246521, + "rewards/correct_reward_func/std": 0.15882909297943115, + "step": 189 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2162.0, + "completions/max_terminated_length": 2162.0, + "completions/mean_length": 1444.0833740234375, + "completions/mean_terminated_length": 1444.0833740234375, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "epoch": 0.29595015576323985, + "grad_norm": 0.6393625736236572, + "kl": 0.03580236993730068, + "learning_rate": 1.888125e-06, + "loss": -0.0305, + "num_tokens": 24087125.0, + "reward": 1.4801563024520874, + "reward_std": 0.0749017521739006, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4801561236381531, + "rewards/correct_reward_func/std": 0.1525711715221405, + "step": 190 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2551.0, + "completions/max_terminated_length": 2551.0, + "completions/mean_length": 1454.8929443359375, + "completions/mean_terminated_length": 1454.8929443359375, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "epoch": 0.29750778816199375, + "grad_norm": 0.6086553931236267, + "kl": 0.036950401961803436, + "learning_rate": 1.8875e-06, + "loss": -0.0006, + "num_tokens": 24215414.0, + "reward": 1.4507030248641968, + "reward_std": 0.07029537856578827, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4507029950618744, + "rewards/correct_reward_func/std": 0.1585504561662674, + "step": 191 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2119.0, + "completions/max_terminated_length": 2119.0, + "completions/mean_length": 1376.3690185546875, + "completions/mean_terminated_length": 1376.3690185546875, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "epoch": 0.29906542056074764, + "grad_norm": 0.6285943984985352, + "kl": 0.03829081356525421, + "learning_rate": 1.886875e-06, + "loss": 0.015, + "num_tokens": 24336927.0, + "reward": 1.422336459159851, + "reward_std": 0.06646425276994705, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42233631014823914, + "rewards/correct_reward_func/std": 0.12761962413787842, + "step": 192 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2072.0, + "completions/mean_length": 1498.59521484375, + "completions/mean_terminated_length": 1417.9517822265625, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "epoch": 0.30062305295950154, + "grad_norm": 0.5861801505088806, + "kl": 0.036135466769337654, + "learning_rate": 1.88625e-06, + "loss": 0.0616, + "num_tokens": 24468491.0, + "reward": 1.4807928800582886, + "reward_std": 0.07590245455503464, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.480792760848999, + "rewards/correct_reward_func/std": 0.18916194140911102, + "step": 193 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2399.0, + "completions/max_terminated_length": 2399.0, + "completions/mean_length": 1527.6785888671875, + "completions/mean_terminated_length": 1527.6785888671875, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "epoch": 0.30218068535825543, + "grad_norm": 0.5360192656517029, + "kl": 0.03715986758470535, + "learning_rate": 1.885625e-06, + "loss": 0.0182, + "num_tokens": 24602846.0, + "reward": 1.4494075775146484, + "reward_std": 0.06413312256336212, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44940757751464844, + "rewards/correct_reward_func/std": 0.14744633436203003, + "step": 194 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2146.0, + "completions/max_terminated_length": 2146.0, + "completions/mean_length": 1480.047607421875, + "completions/mean_terminated_length": 1480.047607421875, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "epoch": 0.3037383177570093, + "grad_norm": 0.6489185094833374, + "kl": 0.037878649309277534, + "learning_rate": 1.885e-06, + "loss": -0.0068, + "num_tokens": 24733212.0, + "reward": 1.4438209533691406, + "reward_std": 0.05984492227435112, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44382089376449585, + "rewards/correct_reward_func/std": 0.11844155192375183, + "step": 195 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2308.0, + "completions/max_terminated_length": 2308.0, + "completions/mean_length": 1502.5714111328125, + "completions/mean_terminated_length": 1502.5714111328125, + "completions/min_length": 806.0, + "completions/min_terminated_length": 806.0, + "epoch": 0.3052959501557632, + "grad_norm": 0.6016018390655518, + "kl": 0.037760429084300995, + "learning_rate": 1.8843749999999999e-06, + "loss": -0.0091, + "num_tokens": 24865398.0, + "reward": 1.4562833309173584, + "reward_std": 0.0823647603392601, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46818795800209045, + "rewards/correct_reward_func/std": 0.15202713012695312, + "step": 196 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2920.0, + "completions/max_terminated_length": 2920.0, + "completions/mean_length": 1469.3214111328125, + "completions/mean_terminated_length": 1469.3214111328125, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "epoch": 0.3068535825545171, + "grad_norm": 0.5995698571205139, + "kl": 0.03654175065457821, + "learning_rate": 1.88375e-06, + "loss": 0.027, + "num_tokens": 24995049.0, + "reward": 1.504838466644287, + "reward_std": 0.07503201067447662, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5048382878303528, + "rewards/correct_reward_func/std": 0.14492768049240112, + "step": 197 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2287.0, + "completions/max_terminated_length": 2287.0, + "completions/mean_length": 1468.3214111328125, + "completions/mean_terminated_length": 1468.3214111328125, + "completions/min_length": 949.0, + "completions/min_terminated_length": 949.0, + "epoch": 0.308411214953271, + "grad_norm": 0.5627298951148987, + "kl": 0.03755324147641659, + "learning_rate": 1.8831249999999999e-06, + "loss": 0.0006, + "num_tokens": 25124256.0, + "reward": 1.4946554899215698, + "reward_std": 0.07795637100934982, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4946552515029907, + "rewards/correct_reward_func/std": 0.18591701984405518, + "step": 198 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4446.0, + "completions/max_terminated_length": 4446.0, + "completions/mean_length": 1489.4761962890625, + "completions/mean_terminated_length": 1489.4761962890625, + "completions/min_length": 688.0, + "completions/min_terminated_length": 688.0, + "epoch": 0.3099688473520249, + "grad_norm": 0.5919142365455627, + "kl": 0.03832128271460533, + "learning_rate": 1.8825e-06, + "loss": 0.0226, + "num_tokens": 25255240.0, + "reward": 1.4337986707687378, + "reward_std": 0.13976813852787018, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.457608163356781, + "rewards/correct_reward_func/std": 0.16218747198581696, + "step": 199 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1443.702392578125, + "completions/mean_terminated_length": 1443.702392578125, + "completions/min_length": 1023.0, + "completions/min_terminated_length": 1023.0, + "epoch": 0.3115264797507788, + "grad_norm": 0.6555087566375732, + "kl": 0.0495732706040144, + "learning_rate": 1.8818749999999999e-06, + "loss": 0.0299, + "num_tokens": 25382517.0, + "reward": 1.4128016233444214, + "reward_std": 0.0843491479754448, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4247063994407654, + "rewards/correct_reward_func/std": 0.15398363769054413, + "step": 200 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2807.0, + "completions/max_terminated_length": 2807.0, + "completions/mean_length": 1525.0714111328125, + "completions/mean_terminated_length": 1525.0714111328125, + "completions/min_length": 911.0, + "completions/min_terminated_length": 911.0, + "epoch": 0.3130841121495327, + "grad_norm": 0.6016954779624939, + "kl": 0.03820333816111088, + "learning_rate": 1.88125e-06, + "loss": 0.0094, + "num_tokens": 25516815.0, + "reward": 1.4183402061462402, + "reward_std": 0.08215481042861938, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4421497583389282, + "rewards/correct_reward_func/std": 0.12978559732437134, + "step": 201 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2805.0, + "completions/max_terminated_length": 2805.0, + "completions/mean_length": 1495.3095703125, + "completions/mean_terminated_length": 1495.3095703125, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "epoch": 0.3146417445482866, + "grad_norm": 0.6088923811912537, + "kl": 0.039613427594304085, + "learning_rate": 1.8806249999999999e-06, + "loss": 0.0236, + "num_tokens": 25648445.0, + "reward": 1.4182132482528687, + "reward_std": 0.0829334408044815, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4301179349422455, + "rewards/correct_reward_func/std": 0.14303255081176758, + "step": 202 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2009.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1409.511962890625, + "completions/mean_terminated_length": 1409.511962890625, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "epoch": 0.3161993769470405, + "grad_norm": 0.6218343377113342, + "kl": 0.03885827772319317, + "learning_rate": 1.8799999999999998e-06, + "loss": -0.0285, + "num_tokens": 25772772.0, + "reward": 1.5313540697097778, + "reward_std": 0.06517814844846725, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5313540697097778, + "rewards/correct_reward_func/std": 0.16049352288246155, + "step": 203 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2697.0, + "completions/mean_length": 1644.90478515625, + "completions/mean_terminated_length": 1485.219482421875, + "completions/min_length": 888.0, + "completions/min_terminated_length": 888.0, + "epoch": 0.3177570093457944, + "grad_norm": 0.608787477016449, + "kl": 0.037238216027617455, + "learning_rate": 1.879375e-06, + "loss": 0.1147, + "num_tokens": 25916914.0, + "reward": 1.4204800128936768, + "reward_std": 0.1299707591533661, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4442894458770752, + "rewards/correct_reward_func/std": 0.15315347909927368, + "step": 204 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2141.0, + "completions/max_terminated_length": 2141.0, + "completions/mean_length": 1391.2857666015625, + "completions/mean_terminated_length": 1391.2857666015625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.31931464174454827, + "grad_norm": 0.6118940114974976, + "kl": 0.03947138041257858, + "learning_rate": 1.8787499999999998e-06, + "loss": -0.0174, + "num_tokens": 26039794.0, + "reward": 1.4847838878631592, + "reward_std": 0.08492320775985718, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48478370904922485, + "rewards/correct_reward_func/std": 0.173141211271286, + "step": 205 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2368.0, + "completions/max_terminated_length": 2368.0, + "completions/mean_length": 1503.46435546875, + "completions/mean_terminated_length": 1503.46435546875, + "completions/min_length": 1025.0, + "completions/min_terminated_length": 1025.0, + "epoch": 0.32087227414330216, + "grad_norm": 0.5726230144500732, + "kl": 0.03891279548406601, + "learning_rate": 1.878125e-06, + "loss": 0.0236, + "num_tokens": 26172265.0, + "reward": 1.4738762378692627, + "reward_std": 0.06679094582796097, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4738762378692627, + "rewards/correct_reward_func/std": 0.1379547268152237, + "step": 206 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2086.0, + "completions/max_terminated_length": 2086.0, + "completions/mean_length": 1410.6785888671875, + "completions/mean_terminated_length": 1410.6785888671875, + "completions/min_length": 811.0, + "completions/min_terminated_length": 811.0, + "epoch": 0.32242990654205606, + "grad_norm": 0.6023487448692322, + "kl": 0.04066877439618111, + "learning_rate": 1.8774999999999998e-06, + "loss": -0.0013, + "num_tokens": 26296666.0, + "reward": 1.4893161058425903, + "reward_std": 0.06180576980113983, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4893161356449127, + "rewards/correct_reward_func/std": 0.13071846961975098, + "step": 207 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5342.0, + "completions/max_terminated_length": 5342.0, + "completions/mean_length": 1561.0833740234375, + "completions/mean_terminated_length": 1561.0833740234375, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.32398753894080995, + "grad_norm": 0.5445609092712402, + "kl": 0.03924229182302952, + "learning_rate": 1.876875e-06, + "loss": 0.0128, + "num_tokens": 26433857.0, + "reward": 1.4545553922653198, + "reward_std": 0.047772545367479324, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4545552432537079, + "rewards/correct_reward_func/std": 0.12593813240528107, + "step": 208 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2229.0, + "completions/mean_length": 1578.8690185546875, + "completions/mean_terminated_length": 1499.1927490234375, + "completions/min_length": 763.0, + "completions/min_terminated_length": 763.0, + "epoch": 0.32554517133956384, + "grad_norm": 0.6344649791717529, + "kl": 0.03905314393341541, + "learning_rate": 1.8762499999999998e-06, + "loss": 0.0585, + "num_tokens": 26572404.0, + "reward": 1.366521954536438, + "reward_std": 0.1604805886745453, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669144809246063, + "rewards/correct_reward_func/mean": 0.4022361636161804, + "rewards/correct_reward_func/std": 0.13376381993293762, + "step": 209 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2064.0, + "completions/max_terminated_length": 2064.0, + "completions/mean_length": 1402.7857666015625, + "completions/mean_terminated_length": 1402.7857666015625, + "completions/min_length": 766.0, + "completions/min_terminated_length": 766.0, + "epoch": 0.32710280373831774, + "grad_norm": 0.633216381072998, + "kl": 0.04154348373413086, + "learning_rate": 1.875625e-06, + "loss": -0.0066, + "num_tokens": 26696082.0, + "reward": 1.5711801052093506, + "reward_std": 0.07012538611888885, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5711799263954163, + "rewards/correct_reward_func/std": 0.11415733397006989, + "step": 210 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2280.0, + "completions/max_terminated_length": 2280.0, + "completions/mean_length": 1505.1190185546875, + "completions/mean_terminated_length": 1505.1190185546875, + "completions/min_length": 958.0, + "completions/min_terminated_length": 958.0, + "epoch": 0.32866043613707163, + "grad_norm": 0.5750628113746643, + "kl": 0.040491702035069466, + "learning_rate": 1.8749999999999998e-06, + "loss": 0.0052, + "num_tokens": 26828632.0, + "reward": 1.4506397247314453, + "reward_std": 0.07769308984279633, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46254438161849976, + "rewards/correct_reward_func/std": 0.13594752550125122, + "step": 211 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2156.0, + "completions/max_terminated_length": 2156.0, + "completions/mean_length": 1535.9761962890625, + "completions/mean_terminated_length": 1535.9761962890625, + "completions/min_length": 1069.0, + "completions/min_terminated_length": 1069.0, + "epoch": 0.3302180685358255, + "grad_norm": 0.5685467720031738, + "kl": 0.04025058262050152, + "learning_rate": 1.8743749999999997e-06, + "loss": -0.0142, + "num_tokens": 26963768.0, + "reward": 1.5034323930740356, + "reward_std": 0.06803149729967117, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5034322142601013, + "rewards/correct_reward_func/std": 0.1639764904975891, + "step": 212 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2916.0, + "completions/max_terminated_length": 2916.0, + "completions/mean_length": 1490.21435546875, + "completions/mean_terminated_length": 1490.21435546875, + "completions/min_length": 919.0, + "completions/min_terminated_length": 919.0, + "epoch": 0.3317757009345794, + "grad_norm": 0.6390542387962341, + "kl": 0.0426274798810482, + "learning_rate": 1.8737499999999998e-06, + "loss": -0.0129, + "num_tokens": 27094820.0, + "reward": 1.4857133626937866, + "reward_std": 0.06765501946210861, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48571324348449707, + "rewards/correct_reward_func/std": 0.1724708527326584, + "step": 213 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2195.0, + "completions/max_terminated_length": 2195.0, + "completions/mean_length": 1452.416748046875, + "completions/mean_terminated_length": 1452.416748046875, + "completions/min_length": 581.0, + "completions/min_terminated_length": 581.0, + "epoch": 0.3333333333333333, + "grad_norm": 0.629348635673523, + "kl": 0.04252097010612488, + "learning_rate": 1.8731249999999997e-06, + "loss": -0.0138, + "num_tokens": 27222709.0, + "reward": 1.4405913352966309, + "reward_std": 0.05661017820239067, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44059130549430847, + "rewards/correct_reward_func/std": 0.10197056829929352, + "step": 214 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2295.0, + "completions/max_terminated_length": 2295.0, + "completions/mean_length": 1461.34521484375, + "completions/mean_terminated_length": 1461.34521484375, + "completions/min_length": 956.0, + "completions/min_terminated_length": 956.0, + "epoch": 0.3348909657320872, + "grad_norm": 0.5977749824523926, + "kl": 0.04298440180718899, + "learning_rate": 1.8725e-06, + "loss": -0.0026, + "num_tokens": 27351420.0, + "reward": 1.505555272102356, + "reward_std": 0.07077483087778091, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5055552124977112, + "rewards/correct_reward_func/std": 0.19038523733615875, + "step": 215 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2326.0, + "completions/max_terminated_length": 2326.0, + "completions/mean_length": 1527.8214111328125, + "completions/mean_terminated_length": 1527.8214111328125, + "completions/min_length": 815.0, + "completions/min_terminated_length": 815.0, + "epoch": 0.3364485981308411, + "grad_norm": 0.5859149694442749, + "kl": 0.043821416795253754, + "learning_rate": 1.871875e-06, + "loss": 0.0212, + "num_tokens": 27485739.0, + "reward": 1.4727665185928345, + "reward_std": 0.13684409856796265, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4965760409832001, + "rewards/correct_reward_func/std": 0.15358960628509521, + "step": 216 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2218.0, + "completions/max_terminated_length": 2218.0, + "completions/mean_length": 1495.8929443359375, + "completions/mean_terminated_length": 1495.8929443359375, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "epoch": 0.338006230529595, + "grad_norm": 0.6052335500717163, + "kl": 0.04412714019417763, + "learning_rate": 1.87125e-06, + "loss": 0.0149, + "num_tokens": 27617418.0, + "reward": 1.4542310237884521, + "reward_std": 0.06858990341424942, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.454230934381485, + "rewards/correct_reward_func/std": 0.16202780604362488, + "step": 217 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2493.0, + "completions/max_terminated_length": 2493.0, + "completions/mean_length": 1571.96435546875, + "completions/mean_terminated_length": 1571.96435546875, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "epoch": 0.3395638629283489, + "grad_norm": 0.5771605968475342, + "kl": 0.043260419741272926, + "learning_rate": 1.870625e-06, + "loss": -0.0301, + "num_tokens": 27755499.0, + "reward": 1.4477570056915283, + "reward_std": 0.08448237925767899, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44775694608688354, + "rewards/correct_reward_func/std": 0.15343379974365234, + "step": 218 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2162.0, + "completions/max_terminated_length": 2162.0, + "completions/mean_length": 1507.511962890625, + "completions/mean_terminated_length": 1507.511962890625, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "epoch": 0.3411214953271028, + "grad_norm": 0.5883904099464417, + "kl": 0.044709596782922745, + "learning_rate": 1.87e-06, + "loss": -0.0025, + "num_tokens": 27888064.0, + "reward": 1.4478862285614014, + "reward_std": 0.07146090269088745, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.447886198759079, + "rewards/correct_reward_func/std": 0.12358597666025162, + "step": 219 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2433.0, + "completions/max_terminated_length": 2433.0, + "completions/mean_length": 1559.8214111328125, + "completions/mean_terminated_length": 1559.8214111328125, + "completions/min_length": 1039.0, + "completions/min_terminated_length": 1039.0, + "epoch": 0.3426791277258567, + "grad_norm": 0.5822417140007019, + "kl": 0.04406227543950081, + "learning_rate": 1.869375e-06, + "loss": -0.0193, + "num_tokens": 28025029.0, + "reward": 1.5140615701675415, + "reward_std": 0.10227732360363007, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5259661674499512, + "rewards/correct_reward_func/std": 0.19070927798748016, + "step": 220 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3088.0, + "completions/max_terminated_length": 3088.0, + "completions/mean_length": 1581.6309814453125, + "completions/mean_terminated_length": 1581.6309814453125, + "completions/min_length": 245.0, + "completions/min_terminated_length": 245.0, + "epoch": 0.3442367601246106, + "grad_norm": 0.6107174754142761, + "kl": 0.04346361383795738, + "learning_rate": 1.86875e-06, + "loss": -0.0145, + "num_tokens": 28163856.0, + "reward": 1.5085567235946655, + "reward_std": 0.08370744436979294, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5085568428039551, + "rewards/correct_reward_func/std": 0.14021635055541992, + "step": 221 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2224.0, + "completions/max_terminated_length": 2224.0, + "completions/mean_length": 1555.7261962890625, + "completions/mean_terminated_length": 1555.7261962890625, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "epoch": 0.34579439252336447, + "grad_norm": 0.5877107977867126, + "kl": 0.043224770575761795, + "learning_rate": 1.868125e-06, + "loss": -0.0078, + "num_tokens": 28300597.0, + "reward": 1.4819693565368652, + "reward_std": 0.09113749116659164, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.49387404322624207, + "rewards/correct_reward_func/std": 0.19690191745758057, + "step": 222 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2318.0, + "completions/max_terminated_length": 2318.0, + "completions/mean_length": 1568.0714111328125, + "completions/mean_terminated_length": 1568.0714111328125, + "completions/min_length": 577.0, + "completions/min_terminated_length": 577.0, + "epoch": 0.34735202492211836, + "grad_norm": 0.5657203197479248, + "kl": 0.04514329880475998, + "learning_rate": 1.8675e-06, + "loss": -0.0005, + "num_tokens": 28438135.0, + "reward": 1.417351484298706, + "reward_std": 0.08495763689279556, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4173515737056732, + "rewards/correct_reward_func/std": 0.13209807872772217, + "step": 223 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3149.0, + "completions/max_terminated_length": 3149.0, + "completions/mean_length": 1625.6785888671875, + "completions/mean_terminated_length": 1625.6785888671875, + "completions/min_length": 898.0, + "completions/min_terminated_length": 898.0, + "epoch": 0.34890965732087226, + "grad_norm": 0.6052978038787842, + "kl": 0.04441903904080391, + "learning_rate": 1.866875e-06, + "loss": -0.0046, + "num_tokens": 28580818.0, + "reward": 1.4290771484375, + "reward_std": 0.10442011803388596, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44098177552223206, + "rewards/correct_reward_func/std": 0.1553770899772644, + "step": 224 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2469.0, + "completions/max_terminated_length": 2469.0, + "completions/mean_length": 1623.761962890625, + "completions/mean_terminated_length": 1623.761962890625, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "epoch": 0.35046728971962615, + "grad_norm": 0.5626906156539917, + "kl": 0.04204758256673813, + "learning_rate": 1.86625e-06, + "loss": 0.0061, + "num_tokens": 28723454.0, + "reward": 1.496443510055542, + "reward_std": 0.06441661715507507, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49644333124160767, + "rewards/correct_reward_func/std": 0.1379326730966568, + "step": 225 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2326.0, + "completions/mean_length": 1645.666748046875, + "completions/mean_terminated_length": 1566.795166015625, + "completions/min_length": 1062.0, + "completions/min_terminated_length": 1062.0, + "epoch": 0.35202492211838005, + "grad_norm": 0.568545937538147, + "kl": 0.041140057146549225, + "learning_rate": 1.865625e-06, + "loss": 0.0597, + "num_tokens": 28867588.0, + "reward": 1.450238585472107, + "reward_std": 0.105202816426754, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4621433615684509, + "rewards/correct_reward_func/std": 0.13623471558094025, + "step": 226 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2350.0, + "completions/max_terminated_length": 2350.0, + "completions/mean_length": 1559.2261962890625, + "completions/mean_terminated_length": 1559.2261962890625, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "epoch": 0.35358255451713394, + "grad_norm": 0.6394890546798706, + "kl": 0.04375514201819897, + "learning_rate": 1.865e-06, + "loss": -0.0027, + "num_tokens": 29004347.0, + "reward": 1.4285489320755005, + "reward_std": 0.04757591709494591, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42854899168014526, + "rewards/correct_reward_func/std": 0.12418505549430847, + "step": 227 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2343.0, + "completions/max_terminated_length": 2343.0, + "completions/mean_length": 1529.46435546875, + "completions/mean_terminated_length": 1529.46435546875, + "completions/min_length": 298.0, + "completions/min_terminated_length": 298.0, + "epoch": 0.35514018691588783, + "grad_norm": 0.6031341552734375, + "kl": 0.04356654919683933, + "learning_rate": 1.8643749999999998e-06, + "loss": -0.0296, + "num_tokens": 29138618.0, + "reward": 1.5230813026428223, + "reward_std": 0.09695133566856384, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5230813026428223, + "rewards/correct_reward_func/std": 0.19408383965492249, + "step": 228 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2603.0, + "completions/max_terminated_length": 2603.0, + "completions/mean_length": 1547.21435546875, + "completions/mean_terminated_length": 1547.21435546875, + "completions/min_length": 737.0, + "completions/min_terminated_length": 737.0, + "epoch": 0.35669781931464173, + "grad_norm": 0.5816037654876709, + "kl": 0.04396030865609646, + "learning_rate": 1.86375e-06, + "loss": -0.0083, + "num_tokens": 29274332.0, + "reward": 1.4719088077545166, + "reward_std": 0.07453076541423798, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4719087481498718, + "rewards/correct_reward_func/std": 0.17895452678203583, + "step": 229 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6889.0, + "completions/max_terminated_length": 6889.0, + "completions/mean_length": 1678.75, + "completions/mean_terminated_length": 1678.75, + "completions/min_length": 482.0, + "completions/min_terminated_length": 482.0, + "epoch": 0.3582554517133956, + "grad_norm": 0.5625508427619934, + "kl": 0.04273660108447075, + "learning_rate": 1.8631249999999998e-06, + "loss": -0.0144, + "num_tokens": 29421227.0, + "reward": 1.4578830003738403, + "reward_std": 0.06554538756608963, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45788294076919556, + "rewards/correct_reward_func/std": 0.19993598759174347, + "step": 230 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2497.0, + "completions/mean_length": 1663.2857666015625, + "completions/mean_terminated_length": 1584.62646484375, + "completions/min_length": 1098.0, + "completions/min_terminated_length": 1098.0, + "epoch": 0.3598130841121495, + "grad_norm": 0.6602963209152222, + "kl": 0.042186228558421135, + "learning_rate": 1.8625e-06, + "loss": 0.0662, + "num_tokens": 29566793.0, + "reward": 1.509660005569458, + "reward_std": 0.08291852474212646, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5096598267555237, + "rewards/correct_reward_func/std": 0.140364408493042, + "step": 231 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2407.0, + "completions/max_terminated_length": 2407.0, + "completions/mean_length": 1553.5, + "completions/mean_terminated_length": 1553.5, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "epoch": 0.3613707165109034, + "grad_norm": 0.5636409521102905, + "kl": 0.0421723909676075, + "learning_rate": 1.8618749999999999e-06, + "loss": 0.0171, + "num_tokens": 29703263.0, + "reward": 1.4572166204452515, + "reward_std": 0.08119600266218185, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4691213071346283, + "rewards/correct_reward_func/std": 0.16932806372642517, + "step": 232 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2367.0, + "completions/max_terminated_length": 2367.0, + "completions/mean_length": 1605.0833740234375, + "completions/mean_terminated_length": 1605.0833740234375, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "epoch": 0.3629283489096573, + "grad_norm": 0.5694032907485962, + "kl": 0.042043108493089676, + "learning_rate": 1.86125e-06, + "loss": 0.0077, + "num_tokens": 29844144.0, + "reward": 1.4451359510421753, + "reward_std": 0.05986570194363594, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44513583183288574, + "rewards/correct_reward_func/std": 0.10634031891822815, + "step": 233 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2301.0, + "completions/max_terminated_length": 2301.0, + "completions/mean_length": 1589.84521484375, + "completions/mean_terminated_length": 1589.84521484375, + "completions/min_length": 978.0, + "completions/min_terminated_length": 978.0, + "epoch": 0.3644859813084112, + "grad_norm": 0.5871843695640564, + "kl": 0.04350174590945244, + "learning_rate": 1.8606249999999999e-06, + "loss": 0.0386, + "num_tokens": 29983685.0, + "reward": 1.4710910320281982, + "reward_std": 0.08404207974672318, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47109100222587585, + "rewards/correct_reward_func/std": 0.17535626888275146, + "step": 234 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2334.0, + "completions/max_terminated_length": 2334.0, + "completions/mean_length": 1567.0357666015625, + "completions/mean_terminated_length": 1567.0357666015625, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "epoch": 0.3660436137071651, + "grad_norm": 0.5786914825439453, + "kl": 0.03941281884908676, + "learning_rate": 1.86e-06, + "loss": 0.0094, + "num_tokens": 30121118.0, + "reward": 1.4925031661987305, + "reward_std": 0.10374214500188828, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5044078826904297, + "rewards/correct_reward_func/std": 0.15605290234088898, + "step": 235 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2486.0, + "completions/max_terminated_length": 2486.0, + "completions/mean_length": 1611.0238037109375, + "completions/mean_terminated_length": 1611.0238037109375, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "epoch": 0.367601246105919, + "grad_norm": 0.5391596555709839, + "kl": 0.0409199483692646, + "learning_rate": 1.8593749999999999e-06, + "loss": 0.0334, + "num_tokens": 30262456.0, + "reward": 1.4530733823776245, + "reward_std": 0.06101413816213608, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4530732333660126, + "rewards/correct_reward_func/std": 0.17970231175422668, + "step": 236 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2376.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 1591.416748046875, + "completions/mean_terminated_length": 1591.416748046875, + "completions/min_length": 1109.0, + "completions/min_terminated_length": 1109.0, + "epoch": 0.3691588785046729, + "grad_norm": 0.595643937587738, + "kl": 0.04340810887515545, + "learning_rate": 1.8587499999999998e-06, + "loss": 0.0072, + "num_tokens": 30402333.0, + "reward": 1.46660578250885, + "reward_std": 0.06254373490810394, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46660569310188293, + "rewards/correct_reward_func/std": 0.12236826121807098, + "step": 237 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2291.0, + "completions/max_terminated_length": 2291.0, + "completions/mean_length": 1531.9881591796875, + "completions/mean_terminated_length": 1531.9881591796875, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "epoch": 0.3707165109034268, + "grad_norm": 0.5581408143043518, + "kl": 0.04236830584704876, + "learning_rate": 1.8581249999999999e-06, + "loss": 0.0098, + "num_tokens": 30537140.0, + "reward": 1.5432277917861938, + "reward_std": 0.0626709833741188, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5432276129722595, + "rewards/correct_reward_func/std": 0.14388269186019897, + "step": 238 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2228.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 1550.797607421875, + "completions/mean_terminated_length": 1550.797607421875, + "completions/min_length": 805.0, + "completions/min_terminated_length": 805.0, + "epoch": 0.37227414330218067, + "grad_norm": 0.574525773525238, + "kl": 0.04116277024149895, + "learning_rate": 1.8574999999999998e-06, + "loss": -0.0023, + "num_tokens": 30673653.0, + "reward": 1.4574679136276245, + "reward_std": 0.06034516915678978, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4574679434299469, + "rewards/correct_reward_func/std": 0.12509454786777496, + "step": 239 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2059.0, + "completions/max_terminated_length": 2059.0, + "completions/mean_length": 1503.8929443359375, + "completions/mean_terminated_length": 1503.8929443359375, + "completions/min_length": 1007.0, + "completions/min_terminated_length": 1007.0, + "epoch": 0.37383177570093457, + "grad_norm": 0.5814647674560547, + "kl": 0.04194348491728306, + "learning_rate": 1.856875e-06, + "loss": 0.0009, + "num_tokens": 30806058.0, + "reward": 1.4536018371582031, + "reward_std": 0.0663735568523407, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45360174775123596, + "rewards/correct_reward_func/std": 0.11585589498281479, + "step": 240 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2368.0, + "completions/max_terminated_length": 2368.0, + "completions/mean_length": 1468.6785888671875, + "completions/mean_terminated_length": 1468.6785888671875, + "completions/min_length": 866.0, + "completions/min_terminated_length": 866.0, + "epoch": 0.37538940809968846, + "grad_norm": 0.5900517702102661, + "kl": 0.041691072285175323, + "learning_rate": 1.8562499999999998e-06, + "loss": 0.0052, + "num_tokens": 30935385.0, + "reward": 1.4337111711502075, + "reward_std": 0.0778563842177391, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4456157684326172, + "rewards/correct_reward_func/std": 0.13722439110279083, + "step": 241 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2428.0, + "completions/max_terminated_length": 2428.0, + "completions/mean_length": 1410.9405517578125, + "completions/mean_terminated_length": 1410.9405517578125, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "epoch": 0.37694704049844235, + "grad_norm": 0.617452085018158, + "kl": 0.04172271862626076, + "learning_rate": 1.855625e-06, + "loss": 0.0126, + "num_tokens": 31059784.0, + "reward": 1.4357478618621826, + "reward_std": 0.08216311782598495, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44765254855155945, + "rewards/correct_reward_func/std": 0.13791020214557648, + "step": 242 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2206.0, + "completions/max_terminated_length": 2206.0, + "completions/mean_length": 1451.5833740234375, + "completions/mean_terminated_length": 1451.5833740234375, + "completions/min_length": 883.0, + "completions/min_terminated_length": 883.0, + "epoch": 0.37850467289719625, + "grad_norm": 0.5894716382026672, + "kl": 0.040936123579740524, + "learning_rate": 1.8549999999999998e-06, + "loss": 0.0101, + "num_tokens": 31187819.0, + "reward": 1.533345341682434, + "reward_std": 0.07711285352706909, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5333453416824341, + "rewards/correct_reward_func/std": 0.12535390257835388, + "step": 243 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2403.0, + "completions/max_terminated_length": 2403.0, + "completions/mean_length": 1480.15478515625, + "completions/mean_terminated_length": 1480.15478515625, + "completions/min_length": 954.0, + "completions/min_terminated_length": 954.0, + "epoch": 0.38006230529595014, + "grad_norm": 0.613828182220459, + "kl": 0.043313439935445786, + "learning_rate": 1.854375e-06, + "loss": -0.0173, + "num_tokens": 31318278.0, + "reward": 1.4279972314834595, + "reward_std": 0.06592278927564621, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4279972314834595, + "rewards/correct_reward_func/std": 0.1297278255224228, + "step": 244 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2105.0, + "completions/max_terminated_length": 2105.0, + "completions/mean_length": 1363.1905517578125, + "completions/mean_terminated_length": 1363.1905517578125, + "completions/min_length": 663.0, + "completions/min_terminated_length": 663.0, + "epoch": 0.38161993769470404, + "grad_norm": 0.6236963272094727, + "kl": 0.04276050627231598, + "learning_rate": 1.8537499999999998e-06, + "loss": 0.0313, + "num_tokens": 31438636.0, + "reward": 1.4753836393356323, + "reward_std": 0.09151271730661392, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48728832602500916, + "rewards/correct_reward_func/std": 0.1840963512659073, + "step": 245 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2239.0, + "completions/mean_length": 1526.7857666015625, + "completions/mean_terminated_length": 1446.48193359375, + "completions/min_length": 631.0, + "completions/min_terminated_length": 631.0, + "epoch": 0.38317757009345793, + "grad_norm": 0.5810291171073914, + "kl": 0.04004097357392311, + "learning_rate": 1.8531249999999997e-06, + "loss": 0.0546, + "num_tokens": 31573048.0, + "reward": 1.4642269611358643, + "reward_std": 0.07142822444438934, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46422696113586426, + "rewards/correct_reward_func/std": 0.13529928028583527, + "step": 246 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2306.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 1418.5595703125, + "completions/mean_terminated_length": 1418.5595703125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.3847352024922118, + "grad_norm": 0.594132125377655, + "kl": 0.04153658635914326, + "learning_rate": 1.8525e-06, + "loss": 0.0037, + "num_tokens": 31698363.0, + "reward": 1.4876208305358887, + "reward_std": 0.05413410812616348, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4876207709312439, + "rewards/correct_reward_func/std": 0.134954035282135, + "step": 247 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2087.0, + "completions/max_terminated_length": 2087.0, + "completions/mean_length": 1418.4285888671875, + "completions/mean_terminated_length": 1418.4285888671875, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "epoch": 0.3862928348909657, + "grad_norm": 0.6255624890327454, + "kl": 0.04057171009480953, + "learning_rate": 1.851875e-06, + "loss": 0.0215, + "num_tokens": 31823583.0, + "reward": 1.5005978345870972, + "reward_std": 0.07266637682914734, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5125024914741516, + "rewards/correct_reward_func/std": 0.1100957989692688, + "step": 248 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2308.0, + "completions/max_terminated_length": 2308.0, + "completions/mean_length": 1451.107177734375, + "completions/mean_terminated_length": 1451.107177734375, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.3878504672897196, + "grad_norm": 0.5847201943397522, + "kl": 0.04207037389278412, + "learning_rate": 1.85125e-06, + "loss": 0.0091, + "num_tokens": 31951524.0, + "reward": 1.4142539501190186, + "reward_std": 0.05806390568614006, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41425377130508423, + "rewards/correct_reward_func/std": 0.11380590498447418, + "step": 249 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2403.0, + "completions/max_terminated_length": 2403.0, + "completions/mean_length": 1413.1905517578125, + "completions/mean_terminated_length": 1413.1905517578125, + "completions/min_length": 594.0, + "completions/min_terminated_length": 594.0, + "epoch": 0.3894080996884735, + "grad_norm": 0.6250362396240234, + "kl": 0.04115402512252331, + "learning_rate": 1.850625e-06, + "loss": -0.0033, + "num_tokens": 32076256.0, + "reward": 1.4924920797348022, + "reward_std": 0.08884865790605545, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5043967962265015, + "rewards/correct_reward_func/std": 0.13221827149391174, + "step": 250 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2204.0, + "completions/max_terminated_length": 2204.0, + "completions/mean_length": 1406.2261962890625, + "completions/mean_terminated_length": 1406.2261962890625, + "completions/min_length": 703.0, + "completions/min_terminated_length": 703.0, + "epoch": 0.3909657320872274, + "grad_norm": 0.6414059996604919, + "kl": 0.0399419330060482, + "learning_rate": 1.85e-06, + "loss": 0.0096, + "num_tokens": 32200295.0, + "reward": 1.4972379207611084, + "reward_std": 0.07513421773910522, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4972379803657532, + "rewards/correct_reward_func/std": 0.14758117496967316, + "step": 251 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3084.0, + "completions/max_terminated_length": 3084.0, + "completions/mean_length": 1468.4285888671875, + "completions/mean_terminated_length": 1468.4285888671875, + "completions/min_length": 940.0, + "completions/min_terminated_length": 940.0, + "epoch": 0.3925233644859813, + "grad_norm": 0.5872684717178345, + "kl": 0.04071245715022087, + "learning_rate": 1.849375e-06, + "loss": -0.0157, + "num_tokens": 32329739.0, + "reward": 1.4793757200241089, + "reward_std": 0.07921571284532547, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4793757498264313, + "rewards/correct_reward_func/std": 0.14062678813934326, + "step": 252 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1970.0, + "completions/max_terminated_length": 1970.0, + "completions/mean_length": 1309.952392578125, + "completions/mean_terminated_length": 1309.952392578125, + "completions/min_length": 727.0, + "completions/min_terminated_length": 727.0, + "epoch": 0.3940809968847352, + "grad_norm": 0.6398329138755798, + "kl": 0.042467374354600906, + "learning_rate": 1.8487499999999999e-06, + "loss": -0.001, + "num_tokens": 32445493.0, + "reward": 1.4916237592697144, + "reward_std": 0.10208263248205185, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5035284757614136, + "rewards/correct_reward_func/std": 0.2051754891872406, + "step": 253 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2700.0, + "completions/mean_length": 1518.261962890625, + "completions/mean_terminated_length": 1437.8553466796875, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "epoch": 0.3956386292834891, + "grad_norm": 0.6442155838012695, + "kl": 0.04048139229416847, + "learning_rate": 1.848125e-06, + "loss": 0.0944, + "num_tokens": 32579225.0, + "reward": 1.4657591581344604, + "reward_std": 0.10077626258134842, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4776638448238373, + "rewards/correct_reward_func/std": 0.15037497878074646, + "step": 254 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2150.0, + "completions/max_terminated_length": 2150.0, + "completions/mean_length": 1396.46435546875, + "completions/mean_terminated_length": 1396.46435546875, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.397196261682243, + "grad_norm": 0.5844140648841858, + "kl": 0.03995479829609394, + "learning_rate": 1.8474999999999999e-06, + "loss": -0.0318, + "num_tokens": 32702402.0, + "reward": 1.4533027410507202, + "reward_std": 0.11024706810712814, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4652075171470642, + "rewards/correct_reward_func/std": 0.15434937179088593, + "step": 255 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2225.0, + "completions/max_terminated_length": 2225.0, + "completions/mean_length": 1392.0357666015625, + "completions/mean_terminated_length": 1392.0357666015625, + "completions/min_length": 937.0, + "completions/min_terminated_length": 937.0, + "epoch": 0.3987538940809969, + "grad_norm": 0.6156937479972839, + "kl": 0.04135890118777752, + "learning_rate": 1.846875e-06, + "loss": -0.0274, + "num_tokens": 32825423.0, + "reward": 1.4397926330566406, + "reward_std": 0.13189160823822021, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.46360212564468384, + "rewards/correct_reward_func/std": 0.11824122816324234, + "step": 256 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2059.0, + "completions/max_terminated_length": 2059.0, + "completions/mean_length": 1323.09521484375, + "completions/mean_terminated_length": 1323.09521484375, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "epoch": 0.40031152647975077, + "grad_norm": 0.6653797626495361, + "kl": 0.041794365271925926, + "learning_rate": 1.84625e-06, + "loss": 0.0155, + "num_tokens": 32942683.0, + "reward": 1.4287505149841309, + "reward_std": 0.10937704890966415, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.45256009697914124, + "rewards/correct_reward_func/std": 0.1507876217365265, + "step": 257 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2162.0, + "completions/max_terminated_length": 2162.0, + "completions/mean_length": 1432.6905517578125, + "completions/mean_terminated_length": 1432.6905517578125, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "epoch": 0.40186915887850466, + "grad_norm": 0.5770981311798096, + "kl": 0.04339606128633022, + "learning_rate": 1.845625e-06, + "loss": -0.0066, + "num_tokens": 33068987.0, + "reward": 1.435407280921936, + "reward_std": 0.10868566483259201, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.459216833114624, + "rewards/correct_reward_func/std": 0.13378530740737915, + "step": 258 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2165.0, + "completions/max_terminated_length": 2165.0, + "completions/mean_length": 1342.0, + "completions/mean_terminated_length": 1342.0, + "completions/min_length": 782.0, + "completions/min_terminated_length": 782.0, + "epoch": 0.40342679127725856, + "grad_norm": 0.5935827493667603, + "kl": 0.0418770182877779, + "learning_rate": 1.845e-06, + "loss": -0.0092, + "num_tokens": 33187595.0, + "reward": 1.4948703050613403, + "reward_std": 0.06175254285335541, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4948703348636627, + "rewards/correct_reward_func/std": 0.14253027737140656, + "step": 259 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2052.0, + "completions/max_terminated_length": 2052.0, + "completions/mean_length": 1342.392822265625, + "completions/mean_terminated_length": 1342.392822265625, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "epoch": 0.40498442367601245, + "grad_norm": 0.6045507192611694, + "kl": 0.04008128307759762, + "learning_rate": 1.844375e-06, + "loss": 0.017, + "num_tokens": 33306416.0, + "reward": 1.4627269506454468, + "reward_std": 0.11455470323562622, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.48653644323349, + "rewards/correct_reward_func/std": 0.13522200286388397, + "step": 260 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2129.0, + "completions/max_terminated_length": 2129.0, + "completions/mean_length": 1361.892822265625, + "completions/mean_terminated_length": 1361.892822265625, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "epoch": 0.40654205607476634, + "grad_norm": 0.6531806588172913, + "kl": 0.043158069252967834, + "learning_rate": 1.84375e-06, + "loss": -0.0002, + "num_tokens": 33426683.0, + "reward": 1.4994860887527466, + "reward_std": 0.056075319647789, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4994860291481018, + "rewards/correct_reward_func/std": 0.13296645879745483, + "step": 261 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2196.0, + "completions/max_terminated_length": 2196.0, + "completions/mean_length": 1350.40478515625, + "completions/mean_terminated_length": 1350.40478515625, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "epoch": 0.40809968847352024, + "grad_norm": 0.6397004723548889, + "kl": 0.042096974328160286, + "learning_rate": 1.8431249999999998e-06, + "loss": 0.0194, + "num_tokens": 33546147.0, + "reward": 1.5182113647460938, + "reward_std": 0.09466809034347534, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5420209169387817, + "rewards/correct_reward_func/std": 0.1314767450094223, + "step": 262 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2366.0, + "completions/max_terminated_length": 2366.0, + "completions/mean_length": 1378.107177734375, + "completions/mean_terminated_length": 1378.107177734375, + "completions/min_length": 854.0, + "completions/min_terminated_length": 854.0, + "epoch": 0.40965732087227413, + "grad_norm": 0.6153465509414673, + "kl": 0.04178653843700886, + "learning_rate": 1.8425e-06, + "loss": 0.0061, + "num_tokens": 33667932.0, + "reward": 1.5274070501327515, + "reward_std": 0.06379646062850952, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5274069309234619, + "rewards/correct_reward_func/std": 0.11842171102762222, + "step": 263 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2146.0, + "completions/max_terminated_length": 2146.0, + "completions/mean_length": 1360.357177734375, + "completions/mean_terminated_length": 1360.357177734375, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "epoch": 0.411214953271028, + "grad_norm": 0.6154939532279968, + "kl": 0.04235922172665596, + "learning_rate": 1.8418749999999998e-06, + "loss": 0.0078, + "num_tokens": 33788220.0, + "reward": 1.4856061935424805, + "reward_std": 0.09970905631780624, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5094156861305237, + "rewards/correct_reward_func/std": 0.11878927052021027, + "step": 264 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2242.0, + "completions/max_terminated_length": 2242.0, + "completions/mean_length": 1453.40478515625, + "completions/mean_terminated_length": 1453.40478515625, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.4127725856697819, + "grad_norm": 0.5895340442657471, + "kl": 0.041117291897535324, + "learning_rate": 1.84125e-06, + "loss": 0.0121, + "num_tokens": 33916336.0, + "reward": 1.5335173606872559, + "reward_std": 0.06757655739784241, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5335173010826111, + "rewards/correct_reward_func/std": 0.15610700845718384, + "step": 265 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2147.0, + "completions/max_terminated_length": 2147.0, + "completions/mean_length": 1328.607177734375, + "completions/mean_terminated_length": 1328.607177734375, + "completions/min_length": 617.0, + "completions/min_terminated_length": 617.0, + "epoch": 0.4143302180685358, + "grad_norm": 0.608429491519928, + "kl": 0.04289627820253372, + "learning_rate": 1.8406249999999998e-06, + "loss": 0.0158, + "num_tokens": 34033747.0, + "reward": 1.4430692195892334, + "reward_std": 0.05843156576156616, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.443069189786911, + "rewards/correct_reward_func/std": 0.1901571899652481, + "step": 266 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2294.0, + "completions/max_terminated_length": 2294.0, + "completions/mean_length": 1355.8809814453125, + "completions/mean_terminated_length": 1355.8809814453125, + "completions/min_length": 874.0, + "completions/min_terminated_length": 874.0, + "epoch": 0.4158878504672897, + "grad_norm": 0.6520563960075378, + "kl": 0.04428970441222191, + "learning_rate": 1.84e-06, + "loss": 0.0026, + "num_tokens": 34153503.0, + "reward": 1.441379427909851, + "reward_std": 0.11828587204217911, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4651888906955719, + "rewards/correct_reward_func/std": 0.1112525463104248, + "step": 267 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2317.0, + "completions/max_terminated_length": 2317.0, + "completions/mean_length": 1398.202392578125, + "completions/mean_terminated_length": 1398.202392578125, + "completions/min_length": 672.0, + "completions/min_terminated_length": 672.0, + "epoch": 0.4174454828660436, + "grad_norm": 0.6122962236404419, + "kl": 0.043515296652913094, + "learning_rate": 1.8393749999999999e-06, + "loss": -0.0184, + "num_tokens": 34277012.0, + "reward": 1.4698716402053833, + "reward_std": 0.10509771853685379, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4936811327934265, + "rewards/correct_reward_func/std": 0.1476019024848938, + "step": 268 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3903.0, + "completions/max_terminated_length": 3903.0, + "completions/mean_length": 1497.8929443359375, + "completions/mean_terminated_length": 1497.8929443359375, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "epoch": 0.4190031152647975, + "grad_norm": 0.5924271941184998, + "kl": 0.04205969721078873, + "learning_rate": 1.83875e-06, + "loss": 0.0098, + "num_tokens": 34408937.0, + "reward": 1.509368896484375, + "reward_std": 0.09606263041496277, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5212737321853638, + "rewards/correct_reward_func/std": 0.1800881028175354, + "step": 269 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1868.0, + "completions/max_terminated_length": 1868.0, + "completions/mean_length": 1369.9285888671875, + "completions/mean_terminated_length": 1369.9285888671875, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "epoch": 0.4205607476635514, + "grad_norm": 0.6304810047149658, + "kl": 0.04325720854103565, + "learning_rate": 1.8381249999999999e-06, + "loss": 0.0025, + "num_tokens": 34530017.0, + "reward": 1.5110054016113281, + "reward_std": 0.0600060299038887, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5110054016113281, + "rewards/correct_reward_func/std": 0.1884339600801468, + "step": 270 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2724.0, + "completions/max_terminated_length": 2724.0, + "completions/mean_length": 1406.3214111328125, + "completions/mean_terminated_length": 1406.3214111328125, + "completions/min_length": 413.0, + "completions/min_terminated_length": 413.0, + "epoch": 0.4221183800623053, + "grad_norm": 0.6104097366333008, + "kl": 0.041636811569333076, + "learning_rate": 1.8374999999999998e-06, + "loss": -0.0099, + "num_tokens": 34654028.0, + "reward": 1.5112788677215576, + "reward_std": 0.07226122170686722, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5112787485122681, + "rewards/correct_reward_func/std": 0.16975651681423187, + "step": 271 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2353.0, + "completions/max_terminated_length": 2353.0, + "completions/mean_length": 1444.3690185546875, + "completions/mean_terminated_length": 1444.3690185546875, + "completions/min_length": 753.0, + "completions/min_terminated_length": 753.0, + "epoch": 0.4236760124610592, + "grad_norm": 0.6154975295066833, + "kl": 0.045218756422400475, + "learning_rate": 1.8368749999999999e-06, + "loss": 0.012, + "num_tokens": 34781289.0, + "reward": 1.570892333984375, + "reward_std": 0.06964144110679626, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.570892333984375, + "rewards/correct_reward_func/std": 0.17510028183460236, + "step": 272 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2181.0, + "completions/max_terminated_length": 2181.0, + "completions/mean_length": 1431.8333740234375, + "completions/mean_terminated_length": 1431.8333740234375, + "completions/min_length": 311.0, + "completions/min_terminated_length": 311.0, + "epoch": 0.4252336448598131, + "grad_norm": 0.6236024498939514, + "kl": 0.04151295870542526, + "learning_rate": 1.8362499999999998e-06, + "loss": -0.022, + "num_tokens": 34907617.0, + "reward": 1.4521212577819824, + "reward_std": 0.12464414536952972, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.47593072056770325, + "rewards/correct_reward_func/std": 0.20174725353717804, + "step": 273 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2056.0, + "completions/max_terminated_length": 2056.0, + "completions/mean_length": 1358.96435546875, + "completions/mean_terminated_length": 1358.96435546875, + "completions/min_length": 626.0, + "completions/min_terminated_length": 626.0, + "epoch": 0.42679127725856697, + "grad_norm": 0.6278449892997742, + "kl": 0.04468250274658203, + "learning_rate": 1.835625e-06, + "loss": -0.0027, + "num_tokens": 35027644.0, + "reward": 1.4580986499786377, + "reward_std": 0.10002487152814865, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4700033664703369, + "rewards/correct_reward_func/std": 0.13392986357212067, + "step": 274 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2506.0, + "completions/max_terminated_length": 2506.0, + "completions/mean_length": 1500.6190185546875, + "completions/mean_terminated_length": 1500.6190185546875, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.42834890965732086, + "grad_norm": 0.5943901538848877, + "kl": 0.04206756874918938, + "learning_rate": 1.8349999999999998e-06, + "loss": 0.0157, + "num_tokens": 35159816.0, + "reward": 1.392418622970581, + "reward_std": 0.08610358834266663, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4043233394622803, + "rewards/correct_reward_func/std": 0.11100338399410248, + "step": 275 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2434.0, + "completions/max_terminated_length": 2434.0, + "completions/mean_length": 1480.7261962890625, + "completions/mean_terminated_length": 1480.7261962890625, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "epoch": 0.42990654205607476, + "grad_norm": 0.5723066329956055, + "kl": 0.04631072096526623, + "learning_rate": 1.834375e-06, + "loss": 0.0086, + "num_tokens": 35289993.0, + "reward": 1.4739741086959839, + "reward_std": 0.09586656838655472, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4858788549900055, + "rewards/correct_reward_func/std": 0.15505263209342957, + "step": 276 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0357142857142857, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3679.0, + "completions/mean_length": 1646.666748046875, + "completions/mean_terminated_length": 1404.2469482421875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "epoch": 0.43146417445482865, + "grad_norm": 0.543669581413269, + "kl": 0.03733106330037117, + "learning_rate": 1.8337499999999998e-06, + "loss": 0.182, + "num_tokens": 35434073.0, + "reward": 1.4567782878875732, + "reward_std": 0.13338518142700195, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46868306398391724, + "rewards/correct_reward_func/std": 0.16619150340557098, + "step": 277 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2141.0, + "completions/max_terminated_length": 2141.0, + "completions/mean_length": 1501.5595703125, + "completions/mean_terminated_length": 1501.5595703125, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.43302180685358255, + "grad_norm": 0.6221758723258972, + "kl": 0.04269747622311115, + "learning_rate": 1.8331249999999997e-06, + "loss": 0.0145, + "num_tokens": 35566138.0, + "reward": 1.478410243988037, + "reward_std": 0.0783148929476738, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.49031493067741394, + "rewards/correct_reward_func/std": 0.1501626968383789, + "step": 278 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7459.0, + "completions/max_terminated_length": 7459.0, + "completions/mean_length": 1598.6785888671875, + "completions/mean_terminated_length": 1598.6785888671875, + "completions/min_length": 924.0, + "completions/min_terminated_length": 924.0, + "epoch": 0.43457943925233644, + "grad_norm": 0.5859509706497192, + "kl": 0.04193317890167236, + "learning_rate": 1.8325e-06, + "loss": 0.0074, + "num_tokens": 35706385.0, + "reward": 1.4401181936264038, + "reward_std": 0.0550164058804512, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4401181638240814, + "rewards/correct_reward_func/std": 0.1537192016839981, + "step": 279 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2371.0, + "completions/mean_length": 1499.8095703125, + "completions/mean_terminated_length": 1419.1806640625, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "epoch": 0.43613707165109034, + "grad_norm": 0.595498263835907, + "kl": 0.04362577013671398, + "learning_rate": 1.831875e-06, + "loss": 0.0495, + "num_tokens": 35838297.0, + "reward": 1.4194005727767944, + "reward_std": 0.08994495123624802, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4313051104545593, + "rewards/correct_reward_func/std": 0.14579959213733673, + "step": 280 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3234.0, + "completions/max_terminated_length": 3234.0, + "completions/mean_length": 1566.0357666015625, + "completions/mean_terminated_length": 1566.0357666015625, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.43769470404984423, + "grad_norm": 0.5680190920829773, + "kl": 0.0422314815223217, + "learning_rate": 1.83125e-06, + "loss": 0.0167, + "num_tokens": 35975880.0, + "reward": 1.475471019744873, + "reward_std": 0.11540813744068146, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.49928048253059387, + "rewards/correct_reward_func/std": 0.1275622546672821, + "step": 281 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2062.0, + "completions/max_terminated_length": 2062.0, + "completions/mean_length": 1490.1309814453125, + "completions/mean_terminated_length": 1490.1309814453125, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "epoch": 0.4392523364485981, + "grad_norm": 0.5545284152030945, + "kl": 0.044208116829395294, + "learning_rate": 1.830625e-06, + "loss": -0.0119, + "num_tokens": 36106913.0, + "reward": 1.467574119567871, + "reward_std": 0.09451182931661606, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4794788360595703, + "rewards/correct_reward_func/std": 0.15235535800457, + "step": 282 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2859.0, + "completions/max_terminated_length": 2859.0, + "completions/mean_length": 1569.4405517578125, + "completions/mean_terminated_length": 1569.4405517578125, + "completions/min_length": 964.0, + "completions/min_terminated_length": 964.0, + "epoch": 0.440809968847352, + "grad_norm": 0.5956711173057556, + "kl": 0.04277007095515728, + "learning_rate": 1.83e-06, + "loss": -0.0127, + "num_tokens": 36244560.0, + "reward": 1.465549111366272, + "reward_std": 0.0775151252746582, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4774538576602936, + "rewards/correct_reward_func/std": 0.18763205409049988, + "step": 283 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3013.0, + "completions/max_terminated_length": 3013.0, + "completions/mean_length": 1505.1905517578125, + "completions/mean_terminated_length": 1505.1905517578125, + "completions/min_length": 562.0, + "completions/min_terminated_length": 562.0, + "epoch": 0.4423676012461059, + "grad_norm": 0.613337516784668, + "kl": 0.04313294030725956, + "learning_rate": 1.829375e-06, + "loss": -0.0096, + "num_tokens": 36376960.0, + "reward": 1.51215660572052, + "reward_std": 0.07286342978477478, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5121565461158752, + "rewards/correct_reward_func/std": 0.17705413699150085, + "step": 284 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2579.0, + "completions/max_terminated_length": 2579.0, + "completions/mean_length": 1624.3809814453125, + "completions/mean_terminated_length": 1624.3809814453125, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "epoch": 0.4439252336448598, + "grad_norm": 0.5604744553565979, + "kl": 0.04310506582260132, + "learning_rate": 1.82875e-06, + "loss": 0.008, + "num_tokens": 36519414.0, + "reward": 1.4455859661102295, + "reward_std": 0.07997937500476837, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.45749059319496155, + "rewards/correct_reward_func/std": 0.15531718730926514, + "step": 285 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2318.0, + "completions/max_terminated_length": 2318.0, + "completions/mean_length": 1497.3809814453125, + "completions/mean_terminated_length": 1497.3809814453125, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.4454828660436137, + "grad_norm": 0.6142125129699707, + "kl": 0.04405369609594345, + "learning_rate": 1.828125e-06, + "loss": 0.0213, + "num_tokens": 36651170.0, + "reward": 1.4967048168182373, + "reward_std": 0.05460391938686371, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49670469760894775, + "rewards/correct_reward_func/std": 0.11759886145591736, + "step": 286 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2114.0, + "completions/max_terminated_length": 2114.0, + "completions/mean_length": 1550.3809814453125, + "completions/mean_terminated_length": 1550.3809814453125, + "completions/min_length": 801.0, + "completions/min_terminated_length": 801.0, + "epoch": 0.4470404984423676, + "grad_norm": 0.5982187390327454, + "kl": 0.043098822236061096, + "learning_rate": 1.8274999999999999e-06, + "loss": 0.0141, + "num_tokens": 36787384.0, + "reward": 1.5390734672546387, + "reward_std": 0.07396355271339417, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5390734672546387, + "rewards/correct_reward_func/std": 0.15539847314357758, + "step": 287 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2471.0, + "completions/mean_length": 1590.9285888671875, + "completions/mean_terminated_length": 1511.3975830078125, + "completions/min_length": 472.0, + "completions/min_terminated_length": 472.0, + "epoch": 0.4485981308411215, + "grad_norm": 0.5779162645339966, + "kl": 0.04267328046262264, + "learning_rate": 1.826875e-06, + "loss": 0.0529, + "num_tokens": 36926968.0, + "reward": 1.4345697164535522, + "reward_std": 0.06787623465061188, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4345696270465851, + "rewards/correct_reward_func/std": 0.1536804884672165, + "step": 288 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2519.0, + "completions/max_terminated_length": 2519.0, + "completions/mean_length": 1572.46435546875, + "completions/mean_terminated_length": 1572.46435546875, + "completions/min_length": 619.0, + "completions/min_terminated_length": 619.0, + "epoch": 0.4501557632398754, + "grad_norm": 0.5400437116622925, + "kl": 0.0447152704000473, + "learning_rate": 1.8262499999999999e-06, + "loss": -0.0149, + "num_tokens": 37065001.0, + "reward": 1.4242392778396606, + "reward_std": 0.0718853771686554, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42423921823501587, + "rewards/correct_reward_func/std": 0.12876558303833008, + "step": 289 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2585.0, + "completions/max_terminated_length": 2585.0, + "completions/mean_length": 1544.71435546875, + "completions/mean_terminated_length": 1544.71435546875, + "completions/min_length": 554.0, + "completions/min_terminated_length": 554.0, + "epoch": 0.4517133956386293, + "grad_norm": 0.5619280338287354, + "kl": 0.044344568625092506, + "learning_rate": 1.825625e-06, + "loss": 0.0099, + "num_tokens": 37200853.0, + "reward": 1.4639300107955933, + "reward_std": 0.1035037636756897, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.47583478689193726, + "rewards/correct_reward_func/std": 0.17393389344215393, + "step": 290 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2626.0, + "completions/max_terminated_length": 2626.0, + "completions/mean_length": 1503.4761962890625, + "completions/mean_terminated_length": 1503.4761962890625, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "epoch": 0.4532710280373832, + "grad_norm": 0.5982638001441956, + "kl": 0.04427545331418514, + "learning_rate": 1.8249999999999999e-06, + "loss": -0.0342, + "num_tokens": 37332971.0, + "reward": 1.4380909204483032, + "reward_std": 0.06856860220432281, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43809083104133606, + "rewards/correct_reward_func/std": 0.13657042384147644, + "step": 291 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2390.0, + "completions/max_terminated_length": 2390.0, + "completions/mean_length": 1529.8214111328125, + "completions/mean_terminated_length": 1529.8214111328125, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "epoch": 0.45482866043613707, + "grad_norm": 0.6071525812149048, + "kl": 0.04384468495845795, + "learning_rate": 1.824375e-06, + "loss": 0.0026, + "num_tokens": 37467494.0, + "reward": 1.4335728883743286, + "reward_std": 0.04877452179789543, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43357276916503906, + "rewards/correct_reward_func/std": 0.14995594322681427, + "step": 292 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2314.0, + "completions/max_terminated_length": 2314.0, + "completions/mean_length": 1446.8690185546875, + "completions/mean_terminated_length": 1446.8690185546875, + "completions/min_length": 732.0, + "completions/min_terminated_length": 732.0, + "epoch": 0.45638629283489096, + "grad_norm": 0.6094366312026978, + "kl": 0.04509362578392029, + "learning_rate": 1.82375e-06, + "loss": 0.0195, + "num_tokens": 37594917.0, + "reward": 1.5306507349014282, + "reward_std": 0.08785208314657211, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5306507349014282, + "rewards/correct_reward_func/std": 0.14832548797130585, + "step": 293 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2716.0, + "completions/max_terminated_length": 2716.0, + "completions/mean_length": 1544.011962890625, + "completions/mean_terminated_length": 1544.011962890625, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.45794392523364486, + "grad_norm": 0.6053863167762756, + "kl": 0.04364632070064545, + "learning_rate": 1.823125e-06, + "loss": -0.0119, + "num_tokens": 37730644.0, + "reward": 1.4835604429244995, + "reward_std": 0.057300373911857605, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4835604131221771, + "rewards/correct_reward_func/std": 0.17512056231498718, + "step": 294 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3292.0, + "completions/max_terminated_length": 3292.0, + "completions/mean_length": 1579.09521484375, + "completions/mean_terminated_length": 1579.09521484375, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "epoch": 0.45950155763239875, + "grad_norm": 0.5880759954452515, + "kl": 0.0431599710136652, + "learning_rate": 1.8225e-06, + "loss": -0.0029, + "num_tokens": 37869492.0, + "reward": 1.462117314338684, + "reward_std": 0.05300503969192505, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46211737394332886, + "rewards/correct_reward_func/std": 0.11961612105369568, + "step": 295 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2367.0, + "completions/max_terminated_length": 2367.0, + "completions/mean_length": 1529.5, + "completions/mean_terminated_length": 1529.5, + "completions/min_length": 519.0, + "completions/min_terminated_length": 519.0, + "epoch": 0.46105919003115264, + "grad_norm": 0.5763446092605591, + "kl": 0.04444514401257038, + "learning_rate": 1.8218749999999998e-06, + "loss": 0.0078, + "num_tokens": 38003970.0, + "reward": 1.4610320329666138, + "reward_std": 0.08918090909719467, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46103209257125854, + "rewards/correct_reward_func/std": 0.13395950198173523, + "step": 296 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2607.0, + "completions/mean_length": 1656.46435546875, + "completions/mean_terminated_length": 1577.7227783203125, + "completions/min_length": 929.0, + "completions/min_terminated_length": 929.0, + "epoch": 0.46261682242990654, + "grad_norm": 0.5834442973136902, + "kl": 0.041973644867539406, + "learning_rate": 1.82125e-06, + "loss": 0.0617, + "num_tokens": 38149245.0, + "reward": 1.4773956537246704, + "reward_std": 0.09391757100820541, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48930031061172485, + "rewards/correct_reward_func/std": 0.1633952558040619, + "step": 297 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2122.0, + "completions/max_terminated_length": 2122.0, + "completions/mean_length": 1549.3929443359375, + "completions/mean_terminated_length": 1549.3929443359375, + "completions/min_length": 818.0, + "completions/min_terminated_length": 818.0, + "epoch": 0.46417445482866043, + "grad_norm": 0.6317084431648254, + "kl": 0.046587640419602394, + "learning_rate": 1.8206249999999998e-06, + "loss": 0.0175, + "num_tokens": 38285280.0, + "reward": 1.449130892753601, + "reward_std": 0.14152653515338898, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.47294026613235474, + "rewards/correct_reward_func/std": 0.14357496798038483, + "step": 298 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2408.0, + "completions/max_terminated_length": 2408.0, + "completions/mean_length": 1490.2738037109375, + "completions/mean_terminated_length": 1490.2738037109375, + "completions/min_length": 257.0, + "completions/min_terminated_length": 257.0, + "epoch": 0.4657320872274143, + "grad_norm": 0.5916226506233215, + "kl": 0.04444451816380024, + "learning_rate": 1.82e-06, + "loss": -0.0187, + "num_tokens": 38416565.0, + "reward": 1.4541789293289185, + "reward_std": 0.04750651866197586, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45417898893356323, + "rewards/correct_reward_func/std": 0.1580149382352829, + "step": 299 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1604.3333740234375, + "completions/mean_terminated_length": 1524.9637451171875, + "completions/min_length": 1040.0, + "completions/min_terminated_length": 1040.0, + "epoch": 0.4672897196261682, + "grad_norm": 0.5734702944755554, + "kl": 0.0429048128426075, + "learning_rate": 1.8193749999999998e-06, + "loss": 0.0538, + "num_tokens": 38557047.0, + "reward": 1.4508525133132935, + "reward_std": 0.12774604558944702, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.47466206550598145, + "rewards/correct_reward_func/std": 0.14542065560817719, + "step": 300 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2414.0, + "completions/max_terminated_length": 2414.0, + "completions/mean_length": 1552.4761962890625, + "completions/mean_terminated_length": 1552.4761962890625, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 0.4688473520249221, + "grad_norm": 0.596815288066864, + "kl": 0.043997010216116905, + "learning_rate": 1.81875e-06, + "loss": 0.0025, + "num_tokens": 38693335.0, + "reward": 1.4665279388427734, + "reward_std": 0.0923430472612381, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4784325957298279, + "rewards/correct_reward_func/std": 0.1467050313949585, + "step": 301 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5495.0, + "completions/max_terminated_length": 5495.0, + "completions/mean_length": 1538.1785888671875, + "completions/mean_terminated_length": 1538.1785888671875, + "completions/min_length": 959.0, + "completions/min_terminated_length": 959.0, + "epoch": 0.470404984423676, + "grad_norm": 0.5941459536552429, + "kl": 0.043290507048368454, + "learning_rate": 1.8181249999999999e-06, + "loss": 0.0057, + "num_tokens": 38828434.0, + "reward": 1.5739519596099854, + "reward_std": 0.07381974905729294, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5739518404006958, + "rewards/correct_reward_func/std": 0.1708972305059433, + "step": 302 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2525.0, + "completions/max_terminated_length": 2525.0, + "completions/mean_length": 1535.1190185546875, + "completions/mean_terminated_length": 1535.1190185546875, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.4719626168224299, + "grad_norm": 0.6311259865760803, + "kl": 0.044257769361138344, + "learning_rate": 1.8174999999999998e-06, + "loss": -0.0331, + "num_tokens": 38963522.0, + "reward": 1.4050052165985107, + "reward_std": 0.12261962890625, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.42881447076797485, + "rewards/correct_reward_func/std": 0.15219928324222565, + "step": 303 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2573.0, + "completions/max_terminated_length": 2573.0, + "completions/mean_length": 1518.4405517578125, + "completions/mean_terminated_length": 1518.4405517578125, + "completions/min_length": 951.0, + "completions/min_terminated_length": 951.0, + "epoch": 0.4735202492211838, + "grad_norm": 0.5748085379600525, + "kl": 0.0418586116284132, + "learning_rate": 1.8168749999999999e-06, + "loss": -0.0019, + "num_tokens": 39097101.0, + "reward": 1.4809808731079102, + "reward_std": 0.09693938493728638, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4928855299949646, + "rewards/correct_reward_func/std": 0.14456793665885925, + "step": 304 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2232.0, + "completions/max_terminated_length": 2232.0, + "completions/mean_length": 1455.0595703125, + "completions/mean_terminated_length": 1455.0595703125, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "epoch": 0.4750778816199377, + "grad_norm": 0.6262243986129761, + "kl": 0.04568205960094929, + "learning_rate": 1.8162499999999998e-06, + "loss": 0.005, + "num_tokens": 39225266.0, + "reward": 1.5073304176330566, + "reward_std": 0.0670827329158783, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5073302984237671, + "rewards/correct_reward_func/std": 0.12795594334602356, + "step": 305 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2476.0, + "completions/max_terminated_length": 2476.0, + "completions/mean_length": 1483.90478515625, + "completions/mean_terminated_length": 1483.90478515625, + "completions/min_length": 917.0, + "completions/min_terminated_length": 917.0, + "epoch": 0.4766355140186916, + "grad_norm": 0.6198824644088745, + "kl": 0.04370650835335255, + "learning_rate": 1.8156249999999999e-06, + "loss": 0.0063, + "num_tokens": 39356004.0, + "reward": 1.5359094142913818, + "reward_std": 0.04993622750043869, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5359094142913818, + "rewards/correct_reward_func/std": 0.13109326362609863, + "step": 306 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2527.0, + "completions/max_terminated_length": 2527.0, + "completions/mean_length": 1559.5, + "completions/mean_terminated_length": 1559.5, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.4781931464174455, + "grad_norm": 0.5853797197341919, + "kl": 0.04342350363731384, + "learning_rate": 1.8149999999999998e-06, + "loss": 0.0145, + "num_tokens": 39493110.0, + "reward": 1.4782713651657104, + "reward_std": 0.08605591952800751, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.49017614126205444, + "rewards/correct_reward_func/std": 0.1508086770772934, + "step": 307 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2326.0, + "completions/max_terminated_length": 2326.0, + "completions/mean_length": 1511.09521484375, + "completions/mean_terminated_length": 1511.09521484375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.4797507788161994, + "grad_norm": 0.5664383769035339, + "kl": 0.04395752586424351, + "learning_rate": 1.8143749999999999e-06, + "loss": 0.0426, + "num_tokens": 39625916.0, + "reward": 1.4828287363052368, + "reward_std": 0.06932734698057175, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48282867670059204, + "rewards/correct_reward_func/std": 0.17860376834869385, + "step": 308 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2080.0, + "completions/max_terminated_length": 2080.0, + "completions/mean_length": 1484.916748046875, + "completions/mean_terminated_length": 1484.916748046875, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "epoch": 0.48130841121495327, + "grad_norm": 0.617337167263031, + "kl": 0.043506965041160583, + "learning_rate": 1.8137499999999998e-06, + "loss": -0.0018, + "num_tokens": 39756451.0, + "reward": 1.4927843809127808, + "reward_std": 0.10661379992961884, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.50468909740448, + "rewards/correct_reward_func/std": 0.1741182804107666, + "step": 309 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2651.0, + "completions/max_terminated_length": 2651.0, + "completions/mean_length": 1531.8214111328125, + "completions/mean_terminated_length": 1531.8214111328125, + "completions/min_length": 865.0, + "completions/min_terminated_length": 865.0, + "epoch": 0.48286604361370716, + "grad_norm": 0.635550320148468, + "kl": 0.045443542301654816, + "learning_rate": 1.8131250000000001e-06, + "loss": -0.0145, + "num_tokens": 39891016.0, + "reward": 1.4746626615524292, + "reward_std": 0.062213968485593796, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4746626019477844, + "rewards/correct_reward_func/std": 0.1761779487133026, + "step": 310 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2577.0, + "completions/max_terminated_length": 2577.0, + "completions/mean_length": 1529.7381591796875, + "completions/mean_terminated_length": 1529.7381591796875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.48442367601246106, + "grad_norm": 0.573308527469635, + "kl": 0.04426569677889347, + "learning_rate": 1.8125e-06, + "loss": -0.0208, + "num_tokens": 40025544.0, + "reward": 1.506926417350769, + "reward_std": 0.07785354554653168, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.506926417350769, + "rewards/correct_reward_func/std": 0.15344847738742828, + "step": 311 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2376.0, + "completions/max_terminated_length": 2376.0, + "completions/mean_length": 1534.297607421875, + "completions/mean_terminated_length": 1534.297607421875, + "completions/min_length": 992.0, + "completions/min_terminated_length": 992.0, + "epoch": 0.48598130841121495, + "grad_norm": 0.5884522199630737, + "kl": 0.04365627467632294, + "learning_rate": 1.811875e-06, + "loss": -0.0051, + "num_tokens": 40160329.0, + "reward": 1.5241272449493408, + "reward_std": 0.08640160411596298, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5241272449493408, + "rewards/correct_reward_func/std": 0.1817137748003006, + "step": 312 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2641.0, + "completions/max_terminated_length": 2641.0, + "completions/mean_length": 1530.40478515625, + "completions/mean_terminated_length": 1530.40478515625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "epoch": 0.48753894080996885, + "grad_norm": 0.6008781790733337, + "kl": 0.04319826699793339, + "learning_rate": 1.81125e-06, + "loss": 0.0087, + "num_tokens": 40294919.0, + "reward": 1.5073949098587036, + "reward_std": 0.06965342164039612, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5073949098587036, + "rewards/correct_reward_func/std": 0.17690497636795044, + "step": 313 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2659.0, + "completions/max_terminated_length": 2659.0, + "completions/mean_length": 1450.0, + "completions/mean_terminated_length": 1450.0, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "epoch": 0.48909657320872274, + "grad_norm": 0.728448748588562, + "kl": 0.044476715847849846, + "learning_rate": 1.810625e-06, + "loss": 0.0265, + "num_tokens": 40422653.0, + "reward": 1.4225661754608154, + "reward_std": 0.1585291177034378, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669144809246063, + "rewards/correct_reward_func/mean": 0.458280473947525, + "rewards/correct_reward_func/std": 0.17805902659893036, + "step": 314 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2806.0, + "completions/max_terminated_length": 2806.0, + "completions/mean_length": 1425.8929443359375, + "completions/mean_terminated_length": 1425.8929443359375, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "epoch": 0.49065420560747663, + "grad_norm": 0.6005982160568237, + "kl": 0.043727852404117584, + "learning_rate": 1.81e-06, + "loss": 0.0128, + "num_tokens": 40548212.0, + "reward": 1.447396993637085, + "reward_std": 0.0689636841416359, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4473969638347626, + "rewards/correct_reward_func/std": 0.10822274535894394, + "step": 315 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2858.0, + "completions/mean_length": 1671.4761962890625, + "completions/mean_terminated_length": 1592.9156494140625, + "completions/min_length": 1111.0, + "completions/min_terminated_length": 1111.0, + "epoch": 0.49221183800623053, + "grad_norm": 0.5339775085449219, + "kl": 0.04142884351313114, + "learning_rate": 1.809375e-06, + "loss": 0.0495, + "num_tokens": 40694814.0, + "reward": 1.5396287441253662, + "reward_std": 0.06713546812534332, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5396286845207214, + "rewards/correct_reward_func/std": 0.16389040648937225, + "step": 316 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2193.0, + "completions/max_terminated_length": 2193.0, + "completions/mean_length": 1511.6190185546875, + "completions/mean_terminated_length": 1511.6190185546875, + "completions/min_length": 689.0, + "completions/min_terminated_length": 689.0, + "epoch": 0.4937694704049844, + "grad_norm": 0.5666081309318542, + "kl": 0.04457671754062176, + "learning_rate": 1.80875e-06, + "loss": -0.0035, + "num_tokens": 40827688.0, + "reward": 1.4729593992233276, + "reward_std": 0.06596960127353668, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47295936942100525, + "rewards/correct_reward_func/std": 0.18029561638832092, + "step": 317 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2383.0, + "completions/max_terminated_length": 2383.0, + "completions/mean_length": 1499.1309814453125, + "completions/mean_terminated_length": 1499.1309814453125, + "completions/min_length": 580.0, + "completions/min_terminated_length": 580.0, + "epoch": 0.4953271028037383, + "grad_norm": 0.5493736863136292, + "kl": 0.04210697114467621, + "learning_rate": 1.808125e-06, + "loss": 0.0096, + "num_tokens": 40959555.0, + "reward": 1.4608927965164185, + "reward_std": 0.05956989526748657, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46089282631874084, + "rewards/correct_reward_func/std": 0.13776575028896332, + "step": 318 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2211.0, + "completions/max_terminated_length": 2211.0, + "completions/mean_length": 1471.8214111328125, + "completions/mean_terminated_length": 1471.8214111328125, + "completions/min_length": 996.0, + "completions/min_terminated_length": 996.0, + "epoch": 0.4968847352024922, + "grad_norm": 0.5784661173820496, + "kl": 0.04475216940045357, + "learning_rate": 1.8075e-06, + "loss": 0.003, + "num_tokens": 41089194.0, + "reward": 1.4597842693328857, + "reward_std": 0.06170998513698578, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4597841799259186, + "rewards/correct_reward_func/std": 0.132298082113266, + "step": 319 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2501.0, + "completions/max_terminated_length": 2501.0, + "completions/mean_length": 1545.2381591796875, + "completions/mean_terminated_length": 1545.2381591796875, + "completions/min_length": 461.0, + "completions/min_terminated_length": 461.0, + "epoch": 0.4984423676012461, + "grad_norm": 0.5935900211334229, + "kl": 0.04243394732475281, + "learning_rate": 1.806875e-06, + "loss": 0.0376, + "num_tokens": 41224964.0, + "reward": 1.4219588041305542, + "reward_std": 0.07682619988918304, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43386340141296387, + "rewards/correct_reward_func/std": 0.117339126765728, + "step": 320 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2246.0, + "completions/max_terminated_length": 2246.0, + "completions/mean_length": 1519.857177734375, + "completions/mean_terminated_length": 1519.857177734375, + "completions/min_length": 846.0, + "completions/min_terminated_length": 846.0, + "epoch": 0.5, + "grad_norm": 0.5818554162979126, + "kl": 0.042232925072312355, + "learning_rate": 1.8062499999999999e-06, + "loss": 0.0097, + "num_tokens": 41358608.0, + "reward": 1.4776239395141602, + "reward_std": 0.04936147853732109, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4776238799095154, + "rewards/correct_reward_func/std": 0.10135854780673981, + "step": 321 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2374.0, + "completions/max_terminated_length": 2374.0, + "completions/mean_length": 1553.5357666015625, + "completions/mean_terminated_length": 1553.5357666015625, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "epoch": 0.5015576323987538, + "grad_norm": 0.5444162487983704, + "kl": 0.04500117152929306, + "learning_rate": 1.805625e-06, + "loss": 0.0176, + "num_tokens": 41494955.0, + "reward": 1.4176223278045654, + "reward_std": 0.08014075458049774, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4295269250869751, + "rewards/correct_reward_func/std": 0.11872898042201996, + "step": 322 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2270.0, + "completions/max_terminated_length": 2270.0, + "completions/mean_length": 1534.8333740234375, + "completions/mean_terminated_length": 1534.8333740234375, + "completions/min_length": 1019.0, + "completions/min_terminated_length": 1019.0, + "epoch": 0.5031152647975078, + "grad_norm": 0.6494289040565491, + "kl": 0.04654599726200104, + "learning_rate": 1.8049999999999999e-06, + "loss": -0.0155, + "num_tokens": 41630079.0, + "reward": 1.550278663635254, + "reward_std": 0.0712866261601448, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5502786040306091, + "rewards/correct_reward_func/std": 0.11994405835866928, + "step": 323 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2504.0, + "completions/max_terminated_length": 2504.0, + "completions/mean_length": 1490.15478515625, + "completions/mean_terminated_length": 1490.15478515625, + "completions/min_length": 819.0, + "completions/min_terminated_length": 819.0, + "epoch": 0.5046728971962616, + "grad_norm": 0.5980774760246277, + "kl": 0.044281333684921265, + "learning_rate": 1.804375e-06, + "loss": -0.0166, + "num_tokens": 41761258.0, + "reward": 1.4597599506378174, + "reward_std": 0.06575565785169601, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4597598910331726, + "rewards/correct_reward_func/std": 0.13967834413051605, + "step": 324 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2893.0, + "completions/max_terminated_length": 2893.0, + "completions/mean_length": 1490.0833740234375, + "completions/mean_terminated_length": 1490.0833740234375, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "epoch": 0.5062305295950156, + "grad_norm": 0.636524498462677, + "kl": 0.04570058174431324, + "learning_rate": 1.8037499999999999e-06, + "loss": 0.0381, + "num_tokens": 41892449.0, + "reward": 1.473886251449585, + "reward_std": 0.08418666571378708, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4738861322402954, + "rewards/correct_reward_func/std": 0.11435925960540771, + "step": 325 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1942.0, + "completions/max_terminated_length": 1942.0, + "completions/mean_length": 1402.40478515625, + "completions/mean_terminated_length": 1402.40478515625, + "completions/min_length": 754.0, + "completions/min_terminated_length": 754.0, + "epoch": 0.5077881619937694, + "grad_norm": 0.6131139397621155, + "kl": 0.04529164917767048, + "learning_rate": 1.803125e-06, + "loss": 0.0006, + "num_tokens": 42016257.0, + "reward": 1.4442152976989746, + "reward_std": 0.06880811601877213, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4442150890827179, + "rewards/correct_reward_func/std": 0.15375681221485138, + "step": 326 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2295.0, + "completions/mean_length": 1520.6785888671875, + "completions/mean_terminated_length": 1440.3011474609375, + "completions/min_length": 868.0, + "completions/min_terminated_length": 868.0, + "epoch": 0.5093457943925234, + "grad_norm": 0.5682876706123352, + "kl": 0.043439922854304314, + "learning_rate": 1.8025e-06, + "loss": 0.0475, + "num_tokens": 42150042.0, + "reward": 1.4249800443649292, + "reward_std": 0.09845460206270218, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43688473105430603, + "rewards/correct_reward_func/std": 0.1302318572998047, + "step": 327 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2421.0, + "completions/max_terminated_length": 2421.0, + "completions/mean_length": 1349.4285888671875, + "completions/mean_terminated_length": 1349.4285888671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "epoch": 0.5109034267912772, + "grad_norm": 0.569858729839325, + "kl": 0.043210411444306374, + "learning_rate": 1.8018749999999998e-06, + "loss": -0.0357, + "num_tokens": 42269154.0, + "reward": 1.5177992582321167, + "reward_std": 0.08048205822706223, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5177991986274719, + "rewards/correct_reward_func/std": 0.15029731392860413, + "step": 328 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2338.0, + "completions/mean_length": 1542.7261962890625, + "completions/mean_terminated_length": 1462.6143798828125, + "completions/min_length": 817.0, + "completions/min_terminated_length": 817.0, + "epoch": 0.5124610591900312, + "grad_norm": 0.5456616878509521, + "kl": 0.07378330640494823, + "learning_rate": 1.80125e-06, + "loss": 0.0514, + "num_tokens": 42404617.0, + "reward": 1.4895997047424316, + "reward_std": 0.07681519538164139, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4895995855331421, + "rewards/correct_reward_func/std": 0.2211972177028656, + "step": 329 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1804.0, + "completions/max_terminated_length": 1804.0, + "completions/mean_length": 1356.261962890625, + "completions/mean_terminated_length": 1356.261962890625, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "epoch": 0.514018691588785, + "grad_norm": 0.6183797121047974, + "kl": 0.04728836566209793, + "learning_rate": 1.8006249999999998e-06, + "loss": -0.0147, + "num_tokens": 42524399.0, + "reward": 1.4941986799240112, + "reward_std": 0.06612447649240494, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49419865012168884, + "rewards/correct_reward_func/std": 0.12598052620887756, + "step": 330 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2059.0, + "completions/max_terminated_length": 2059.0, + "completions/mean_length": 1400.3333740234375, + "completions/mean_terminated_length": 1400.3333740234375, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.5155763239875389, + "grad_norm": 0.564521849155426, + "kl": 0.04574920795857906, + "learning_rate": 1.8e-06, + "loss": 0.0249, + "num_tokens": 42647847.0, + "reward": 1.5129057168960571, + "reward_std": 0.055123478174209595, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5129056572914124, + "rewards/correct_reward_func/std": 0.14364565908908844, + "step": 331 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2623.0, + "completions/mean_length": 1540.7857666015625, + "completions/mean_terminated_length": 1460.6505126953125, + "completions/min_length": 903.0, + "completions/min_terminated_length": 903.0, + "epoch": 0.5171339563862928, + "grad_norm": 0.6232556700706482, + "kl": 0.04752310924232006, + "learning_rate": 1.7993749999999998e-06, + "loss": 0.0535, + "num_tokens": 42783291.0, + "reward": 1.4896059036254883, + "reward_std": 0.07293432950973511, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4896059036254883, + "rewards/correct_reward_func/std": 0.17983748018741608, + "step": 332 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1505.9881591796875, + "completions/mean_terminated_length": 1425.4337158203125, + "completions/min_length": 659.0, + "completions/min_terminated_length": 659.0, + "epoch": 0.5186915887850467, + "grad_norm": 0.5811641216278076, + "kl": 0.043217698112130165, + "learning_rate": 1.79875e-06, + "loss": 0.022, + "num_tokens": 42915812.0, + "reward": 1.4960260391235352, + "reward_std": 0.0619584396481514, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4960259795188904, + "rewards/correct_reward_func/std": 0.13359470665454865, + "step": 333 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2062.0, + "completions/max_terminated_length": 2062.0, + "completions/mean_length": 1459.607177734375, + "completions/mean_terminated_length": 1459.607177734375, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.5202492211838006, + "grad_norm": 0.5979028344154358, + "kl": 0.04545888490974903, + "learning_rate": 1.7981249999999998e-06, + "loss": 0.0155, + "num_tokens": 43044449.0, + "reward": 1.4552279710769653, + "reward_std": 0.06798920035362244, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45522791147232056, + "rewards/correct_reward_func/std": 0.12315916270017624, + "step": 334 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2271.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 1457.8690185546875, + "completions/mean_terminated_length": 1457.8690185546875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.5218068535825545, + "grad_norm": 0.5863680839538574, + "kl": 0.045693760737776756, + "learning_rate": 1.7975e-06, + "loss": -0.0156, + "num_tokens": 43173012.0, + "reward": 1.4975252151489258, + "reward_std": 0.0669432058930397, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4975251257419586, + "rewards/correct_reward_func/std": 0.13803456723690033, + "step": 335 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2238.0, + "completions/mean_length": 1608.5357666015625, + "completions/mean_terminated_length": 1529.216796875, + "completions/min_length": 1015.0, + "completions/min_terminated_length": 1015.0, + "epoch": 0.5233644859813084, + "grad_norm": 0.556442379951477, + "kl": 0.04240516573190689, + "learning_rate": 1.7968749999999998e-06, + "loss": 0.0712, + "num_tokens": 43314111.0, + "reward": 1.51832115650177, + "reward_std": 0.1056382805109024, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5302258729934692, + "rewards/correct_reward_func/std": 0.1863190233707428, + "step": 336 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2391.0, + "completions/max_terminated_length": 2391.0, + "completions/mean_length": 1453.6905517578125, + "completions/mean_terminated_length": 1453.6905517578125, + "completions/min_length": 787.0, + "completions/min_terminated_length": 787.0, + "epoch": 0.5249221183800623, + "grad_norm": 0.6169154644012451, + "kl": 0.04544537514448166, + "learning_rate": 1.7962499999999997e-06, + "loss": 0.0458, + "num_tokens": 43442275.0, + "reward": 1.4681013822555542, + "reward_std": 0.06039302796125412, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4681013226509094, + "rewards/correct_reward_func/std": 0.1258929818868637, + "step": 337 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2267.0, + "completions/max_terminated_length": 2267.0, + "completions/mean_length": 1420.6190185546875, + "completions/mean_terminated_length": 1420.6190185546875, + "completions/min_length": 858.0, + "completions/min_terminated_length": 858.0, + "epoch": 0.5264797507788161, + "grad_norm": 0.5530162453651428, + "kl": 0.04460956156253815, + "learning_rate": 1.7956249999999999e-06, + "loss": -0.0063, + "num_tokens": 43567595.0, + "reward": 1.4919437170028687, + "reward_std": 0.05034913867712021, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49194350838661194, + "rewards/correct_reward_func/std": 0.1505471169948578, + "step": 338 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2238.0, + "completions/max_terminated_length": 2238.0, + "completions/mean_length": 1373.5238037109375, + "completions/mean_terminated_length": 1373.5238037109375, + "completions/min_length": 797.0, + "completions/min_terminated_length": 797.0, + "epoch": 0.5280373831775701, + "grad_norm": 0.6410908102989197, + "kl": 0.04922908917069435, + "learning_rate": 1.7949999999999998e-06, + "loss": 0.0041, + "num_tokens": 43688893.0, + "reward": 1.4642657041549683, + "reward_std": 0.047354813665151596, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4642656445503235, + "rewards/correct_reward_func/std": 0.1483275443315506, + "step": 339 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2333.0, + "completions/max_terminated_length": 2333.0, + "completions/mean_length": 1476.666748046875, + "completions/mean_terminated_length": 1476.666748046875, + "completions/min_length": 825.0, + "completions/min_terminated_length": 825.0, + "epoch": 0.5295950155763239, + "grad_norm": 0.5679633021354675, + "kl": 0.04452272690832615, + "learning_rate": 1.7943749999999999e-06, + "loss": 0.0016, + "num_tokens": 43819023.0, + "reward": 1.4857591390609741, + "reward_std": 0.06386592239141464, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48575901985168457, + "rewards/correct_reward_func/std": 0.1048179492354393, + "step": 340 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2557.0, + "completions/mean_length": 1551.3929443359375, + "completions/mean_terminated_length": 1471.385498046875, + "completions/min_length": 634.0, + "completions/min_terminated_length": 634.0, + "epoch": 0.5311526479750779, + "grad_norm": 0.5662598013877869, + "kl": 0.04444094002246857, + "learning_rate": 1.79375e-06, + "loss": 0.07, + "num_tokens": 43955154.0, + "reward": 1.4689278602600098, + "reward_std": 0.06177349016070366, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4689278304576874, + "rewards/correct_reward_func/std": 0.1346137970685959, + "step": 341 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1957.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1437.9285888671875, + "completions/mean_terminated_length": 1437.9285888671875, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "epoch": 0.5327102803738317, + "grad_norm": 0.6565880179405212, + "kl": 0.04598667845129967, + "learning_rate": 1.793125e-06, + "loss": -0.0001, + "num_tokens": 44081778.0, + "reward": 1.5010194778442383, + "reward_std": 0.06478109210729599, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5010193586349487, + "rewards/correct_reward_func/std": 0.18090546131134033, + "step": 342 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2467.0, + "completions/mean_length": 1543.09521484375, + "completions/mean_terminated_length": 1462.9879150390625, + "completions/min_length": 538.0, + "completions/min_terminated_length": 538.0, + "epoch": 0.5342679127725857, + "grad_norm": 0.6114045977592468, + "kl": 0.043492890894412994, + "learning_rate": 1.7925e-06, + "loss": 0.0833, + "num_tokens": 44217458.0, + "reward": 1.45277738571167, + "reward_std": 0.11382251977920532, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.47658684849739075, + "rewards/correct_reward_func/std": 0.16254591941833496, + "step": 343 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2252.0, + "completions/mean_length": 1516.1429443359375, + "completions/mean_terminated_length": 1435.7108154296875, + "completions/min_length": 918.0, + "completions/min_terminated_length": 918.0, + "epoch": 0.5358255451713395, + "grad_norm": 0.5839765667915344, + "kl": 0.04525020532310009, + "learning_rate": 1.791875e-06, + "loss": 0.0436, + "num_tokens": 44350790.0, + "reward": 1.4360476732254028, + "reward_std": 0.06001214683055878, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43604764342308044, + "rewards/correct_reward_func/std": 0.1315266191959381, + "step": 344 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2188.0, + "completions/max_terminated_length": 2188.0, + "completions/mean_length": 1429.1785888671875, + "completions/mean_terminated_length": 1429.1785888671875, + "completions/min_length": 958.0, + "completions/min_terminated_length": 958.0, + "epoch": 0.5373831775700935, + "grad_norm": 0.567746639251709, + "kl": 0.04449603334069252, + "learning_rate": 1.79125e-06, + "loss": 0.0274, + "num_tokens": 44476895.0, + "reward": 1.4312893152236938, + "reward_std": 0.0564139224588871, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43128931522369385, + "rewards/correct_reward_func/std": 0.13854992389678955, + "step": 345 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1952.0, + "completions/mean_length": 1476.25, + "completions/mean_terminated_length": 1395.3372802734375, + "completions/min_length": 743.0, + "completions/min_terminated_length": 743.0, + "epoch": 0.5389408099688473, + "grad_norm": 0.5948997735977173, + "kl": 0.04478558525443077, + "learning_rate": 1.790625e-06, + "loss": 0.048, + "num_tokens": 44606786.0, + "reward": 1.4746853113174438, + "reward_std": 0.08000284433364868, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4746852219104767, + "rewards/correct_reward_func/std": 0.1414322406053543, + "step": 346 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2192.0, + "completions/mean_length": 1520.8929443359375, + "completions/mean_terminated_length": 1440.51806640625, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "epoch": 0.5404984423676013, + "grad_norm": 0.5873611569404602, + "kl": 0.04373046010732651, + "learning_rate": 1.79e-06, + "loss": 0.0608, + "num_tokens": 44740553.0, + "reward": 1.3899791240692139, + "reward_std": 0.1021641418337822, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4018838405609131, + "rewards/correct_reward_func/std": 0.10966146737337112, + "step": 347 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3150.0, + "completions/mean_length": 1603.3809814453125, + "completions/mean_terminated_length": 1524.0, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "epoch": 0.5420560747663551, + "grad_norm": 0.6007145643234253, + "kl": 0.042141517624258995, + "learning_rate": 1.789375e-06, + "loss": 0.0734, + "num_tokens": 44881519.0, + "reward": 1.516649842262268, + "reward_std": 0.06937997788190842, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5166497230529785, + "rewards/correct_reward_func/std": 0.13787348568439484, + "step": 348 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2165.0, + "completions/mean_length": 1627.0357666015625, + "completions/mean_terminated_length": 1466.91455078125, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.543613707165109, + "grad_norm": 0.5334845185279846, + "kl": 0.04171426221728325, + "learning_rate": 1.78875e-06, + "loss": 0.142, + "num_tokens": 45024094.0, + "reward": 1.4035788774490356, + "reward_std": 0.07490548491477966, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.40357890725135803, + "rewards/correct_reward_func/std": 0.11934227496385574, + "step": 349 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2858.0, + "completions/max_terminated_length": 2858.0, + "completions/mean_length": 1388.5238037109375, + "completions/mean_terminated_length": 1388.5238037109375, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "epoch": 0.5451713395638629, + "grad_norm": 0.5753508806228638, + "kl": 0.04579620808362961, + "learning_rate": 1.788125e-06, + "loss": -0.003, + "num_tokens": 45146580.0, + "reward": 1.4059325456619263, + "reward_std": 0.05845046043395996, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4059324264526367, + "rewards/correct_reward_func/std": 0.14846326410770416, + "step": 350 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1354.6309814453125, + "completions/mean_terminated_length": 1354.6309814453125, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "epoch": 0.5467289719626168, + "grad_norm": 0.6357160210609436, + "kl": 0.045284371823072433, + "learning_rate": 1.7875e-06, + "loss": -0.0187, + "num_tokens": 45266273.0, + "reward": 1.4549281597137451, + "reward_std": 0.07358434051275253, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45492807030677795, + "rewards/correct_reward_func/std": 0.12501084804534912, + "step": 351 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1370.2261962890625, + "completions/mean_terminated_length": 1370.2261962890625, + "completions/min_length": 884.0, + "completions/min_terminated_length": 884.0, + "epoch": 0.5482866043613707, + "grad_norm": 0.6501032114028931, + "kl": 0.04532886669039726, + "learning_rate": 1.786875e-06, + "loss": 0.0287, + "num_tokens": 45387228.0, + "reward": 1.509010910987854, + "reward_std": 0.09563028067350388, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5328204035758972, + "rewards/correct_reward_func/std": 0.1301979273557663, + "step": 352 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2229.0, + "completions/mean_length": 1504.21435546875, + "completions/mean_terminated_length": 1423.6385498046875, + "completions/min_length": 701.0, + "completions/min_terminated_length": 701.0, + "epoch": 0.5498442367601246, + "grad_norm": 0.614535391330719, + "kl": 0.04563060216605663, + "learning_rate": 1.7862499999999998e-06, + "loss": 0.0562, + "num_tokens": 45519630.0, + "reward": 1.5245028734207153, + "reward_std": 0.09191560745239258, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5245028138160706, + "rewards/correct_reward_func/std": 0.19058886170387268, + "step": 353 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2106.0, + "completions/mean_length": 1371.3690185546875, + "completions/mean_terminated_length": 1289.1927490234375, + "completions/min_length": 747.0, + "completions/min_terminated_length": 747.0, + "epoch": 0.5514018691588785, + "grad_norm": 0.5684062242507935, + "kl": 0.04687408730387688, + "learning_rate": 1.785625e-06, + "loss": 0.1009, + "num_tokens": 45640585.0, + "reward": 1.423880696296692, + "reward_std": 0.08402802050113678, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43578535318374634, + "rewards/correct_reward_func/std": 0.14543381333351135, + "step": 354 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2220.0, + "completions/max_terminated_length": 2220.0, + "completions/mean_length": 1358.0238037109375, + "completions/mean_terminated_length": 1358.0238037109375, + "completions/min_length": 723.0, + "completions/min_terminated_length": 723.0, + "epoch": 0.5529595015576324, + "grad_norm": 0.6448848843574524, + "kl": 0.048309145495295525, + "learning_rate": 1.7849999999999999e-06, + "loss": -0.0182, + "num_tokens": 45760533.0, + "reward": 1.4921796321868896, + "reward_std": 0.07159780710935593, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4921795129776001, + "rewards/correct_reward_func/std": 0.15320508182048798, + "step": 355 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2087.0, + "completions/max_terminated_length": 2087.0, + "completions/mean_length": 1354.71435546875, + "completions/mean_terminated_length": 1354.71435546875, + "completions/min_length": 480.0, + "completions/min_terminated_length": 480.0, + "epoch": 0.5545171339563862, + "grad_norm": 0.6176822185516357, + "kl": 0.04802674613893032, + "learning_rate": 1.784375e-06, + "loss": -0.0059, + "num_tokens": 45880269.0, + "reward": 1.4876474142074585, + "reward_std": 0.09487791359424591, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4995521306991577, + "rewards/correct_reward_func/std": 0.13108977675437927, + "step": 356 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2128.0, + "completions/max_terminated_length": 2128.0, + "completions/mean_length": 1344.9285888671875, + "completions/mean_terminated_length": 1344.9285888671875, + "completions/min_length": 769.0, + "completions/min_terminated_length": 769.0, + "epoch": 0.5560747663551402, + "grad_norm": 0.6490213871002197, + "kl": 0.0472539346665144, + "learning_rate": 1.7837499999999999e-06, + "loss": 0.0013, + "num_tokens": 45999249.0, + "reward": 1.44069242477417, + "reward_std": 0.11679985374212265, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.46450188755989075, + "rewards/correct_reward_func/std": 0.13278451561927795, + "step": 357 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2629.0, + "completions/max_terminated_length": 2629.0, + "completions/mean_length": 1377.7857666015625, + "completions/mean_terminated_length": 1377.7857666015625, + "completions/min_length": 887.0, + "completions/min_terminated_length": 887.0, + "epoch": 0.557632398753894, + "grad_norm": 0.6063095331192017, + "kl": 0.045867305248975754, + "learning_rate": 1.783125e-06, + "loss": -0.0135, + "num_tokens": 46121055.0, + "reward": 1.4912810325622559, + "reward_std": 0.07250796258449554, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49128106236457825, + "rewards/correct_reward_func/std": 0.12492024898529053, + "step": 358 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2068.0, + "completions/max_terminated_length": 2068.0, + "completions/mean_length": 1362.761962890625, + "completions/mean_terminated_length": 1362.761962890625, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "epoch": 0.559190031152648, + "grad_norm": 0.6036370992660522, + "kl": 0.047231562435626984, + "learning_rate": 1.7824999999999999e-06, + "loss": -0.0009, + "num_tokens": 46241479.0, + "reward": 1.4975894689559937, + "reward_std": 0.05849050357937813, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4975893795490265, + "rewards/correct_reward_func/std": 0.18169310688972473, + "step": 359 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1936.0, + "completions/max_terminated_length": 1936.0, + "completions/mean_length": 1369.9881591796875, + "completions/mean_terminated_length": 1369.9881591796875, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "epoch": 0.5607476635514018, + "grad_norm": 0.582613468170166, + "kl": 0.04558840952813625, + "learning_rate": 1.781875e-06, + "loss": 0.0201, + "num_tokens": 46362546.0, + "reward": 1.4481348991394043, + "reward_std": 0.09183409065008163, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46003949642181396, + "rewards/correct_reward_func/std": 0.13477934896945953, + "step": 360 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2250.0, + "completions/max_terminated_length": 2250.0, + "completions/mean_length": 1380.261962890625, + "completions/mean_terminated_length": 1380.261962890625, + "completions/min_length": 837.0, + "completions/min_terminated_length": 837.0, + "epoch": 0.5623052959501558, + "grad_norm": 0.6290069818496704, + "kl": 0.04558514803647995, + "learning_rate": 1.7812499999999999e-06, + "loss": 0.0065, + "num_tokens": 46484542.0, + "reward": 1.4386088848114014, + "reward_std": 0.09025963395833969, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4505135715007782, + "rewards/correct_reward_func/std": 0.12712764739990234, + "step": 361 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2111.0, + "completions/max_terminated_length": 2111.0, + "completions/mean_length": 1388.9881591796875, + "completions/mean_terminated_length": 1388.9881591796875, + "completions/min_length": 856.0, + "completions/min_terminated_length": 856.0, + "epoch": 0.5638629283489096, + "grad_norm": 0.6121560335159302, + "kl": 0.046877965331077576, + "learning_rate": 1.7806249999999998e-06, + "loss": -0.0113, + "num_tokens": 46607247.0, + "reward": 1.4698020219802856, + "reward_std": 0.09116669744253159, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48170679807662964, + "rewards/correct_reward_func/std": 0.1016574278473854, + "step": 362 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2040.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1318.9881591796875, + "completions/mean_terminated_length": 1318.9881591796875, + "completions/min_length": 765.0, + "completions/min_terminated_length": 765.0, + "epoch": 0.5654205607476636, + "grad_norm": 0.6227669715881348, + "kl": 0.046381985768675804, + "learning_rate": 1.78e-06, + "loss": -0.0136, + "num_tokens": 46724030.0, + "reward": 1.4805115461349487, + "reward_std": 0.1278102546930313, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5043209791183472, + "rewards/correct_reward_func/std": 0.19428442418575287, + "step": 363 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2555.0, + "completions/max_terminated_length": 2555.0, + "completions/mean_length": 1378.107177734375, + "completions/mean_terminated_length": 1378.107177734375, + "completions/min_length": 933.0, + "completions/min_terminated_length": 933.0, + "epoch": 0.5669781931464174, + "grad_norm": 0.6208792328834534, + "kl": 0.048077501356601715, + "learning_rate": 1.7793749999999998e-06, + "loss": 0.015, + "num_tokens": 46845689.0, + "reward": 1.4393149614334106, + "reward_std": 0.06981474906206131, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43931499123573303, + "rewards/correct_reward_func/std": 0.13650043308734894, + "step": 364 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1724.0, + "completions/mean_length": 1391.65478515625, + "completions/mean_terminated_length": 1309.722900390625, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.5685358255451713, + "grad_norm": 0.624458909034729, + "kl": 0.04737947881221771, + "learning_rate": 1.77875e-06, + "loss": 0.0523, + "num_tokens": 46968504.0, + "reward": 1.3942302465438843, + "reward_std": 0.11320418864488602, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.41803956031799316, + "rewards/correct_reward_func/std": 0.14469270408153534, + "step": 365 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1959.0, + "completions/mean_length": 1453.6905517578125, + "completions/mean_terminated_length": 1372.5059814453125, + "completions/min_length": 908.0, + "completions/min_terminated_length": 908.0, + "epoch": 0.5700934579439252, + "grad_norm": 0.5907891988754272, + "kl": 0.04476970434188843, + "learning_rate": 1.7781249999999998e-06, + "loss": 0.0414, + "num_tokens": 47096638.0, + "reward": 1.4558826684951782, + "reward_std": 0.08813583105802536, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4558826684951782, + "rewards/correct_reward_func/std": 0.1586223989725113, + "step": 366 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2085.0, + "completions/max_terminated_length": 2085.0, + "completions/mean_length": 1346.416748046875, + "completions/mean_terminated_length": 1346.416748046875, + "completions/min_length": 661.0, + "completions/min_terminated_length": 661.0, + "epoch": 0.5716510903426791, + "grad_norm": 0.6247698068618774, + "kl": 0.04782709293067455, + "learning_rate": 1.7775e-06, + "loss": 0.0158, + "num_tokens": 47215659.0, + "reward": 1.4506251811981201, + "reward_std": 0.061412323266267776, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4506250023841858, + "rewards/correct_reward_func/std": 0.1287914216518402, + "step": 367 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2000.0, + "completions/max_terminated_length": 2000.0, + "completions/mean_length": 1385.511962890625, + "completions/mean_terminated_length": 1385.511962890625, + "completions/min_length": 900.0, + "completions/min_terminated_length": 900.0, + "epoch": 0.573208722741433, + "grad_norm": 0.5953041911125183, + "kl": 0.04662996344268322, + "learning_rate": 1.7768749999999998e-06, + "loss": -0.0291, + "num_tokens": 47338246.0, + "reward": 1.61995267868042, + "reward_std": 0.071071557700634, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.6199524998664856, + "rewards/correct_reward_func/std": 0.15339720249176025, + "step": 368 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1815.0, + "completions/max_terminated_length": 1815.0, + "completions/mean_length": 1350.6905517578125, + "completions/mean_terminated_length": 1350.6905517578125, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.5747663551401869, + "grad_norm": 0.596235990524292, + "kl": 0.047611601650714874, + "learning_rate": 1.77625e-06, + "loss": 0.0142, + "num_tokens": 47457746.0, + "reward": 1.4904042482376099, + "reward_std": 0.0895150825381279, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5023089647293091, + "rewards/correct_reward_func/std": 0.10198992490768433, + "step": 369 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2089.0, + "completions/max_terminated_length": 2089.0, + "completions/mean_length": 1335.142822265625, + "completions/mean_terminated_length": 1335.142822265625, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "epoch": 0.5763239875389408, + "grad_norm": 0.6384495496749878, + "kl": 0.04769720509648323, + "learning_rate": 1.7756249999999998e-06, + "loss": 0.0225, + "num_tokens": 47575748.0, + "reward": 1.4142802953720093, + "reward_std": 0.08743462711572647, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4261849820613861, + "rewards/correct_reward_func/std": 0.12853728234767914, + "step": 370 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1798.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 1369.0595703125, + "completions/mean_terminated_length": 1369.0595703125, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "epoch": 0.5778816199376947, + "grad_norm": 0.5966677069664001, + "kl": 0.04758315160870552, + "learning_rate": 1.7749999999999997e-06, + "loss": 0.0182, + "num_tokens": 47696935.0, + "reward": 1.4828351736068726, + "reward_std": 0.06344291567802429, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.482835054397583, + "rewards/correct_reward_func/std": 0.13246676325798035, + "step": 371 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1849.0, + "completions/max_terminated_length": 1849.0, + "completions/mean_length": 1291.5714111328125, + "completions/mean_terminated_length": 1291.5714111328125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.5794392523364486, + "grad_norm": 0.6525982618331909, + "kl": 0.049696190282702446, + "learning_rate": 1.774375e-06, + "loss": -0.0193, + "num_tokens": 47811241.0, + "reward": 1.4084582328796387, + "reward_std": 0.06577665358781815, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.40845808386802673, + "rewards/correct_reward_func/std": 0.11660967767238617, + "step": 372 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2206.0, + "completions/mean_length": 1518.5595703125, + "completions/mean_terminated_length": 1438.1566162109375, + "completions/min_length": 864.0, + "completions/min_terminated_length": 864.0, + "epoch": 0.5809968847352025, + "grad_norm": 0.5672593116760254, + "kl": 0.04501592554152012, + "learning_rate": 1.77375e-06, + "loss": 0.0895, + "num_tokens": 47944788.0, + "reward": 1.5155577659606934, + "reward_std": 0.06096799299120903, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5155577659606934, + "rewards/correct_reward_func/std": 0.16944406926631927, + "step": 373 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2131.0, + "completions/max_terminated_length": 2131.0, + "completions/mean_length": 1468.6309814453125, + "completions/mean_terminated_length": 1468.6309814453125, + "completions/min_length": 833.0, + "completions/min_terminated_length": 833.0, + "epoch": 0.5825545171339563, + "grad_norm": 0.5926371216773987, + "kl": 0.04884720593690872, + "learning_rate": 1.773125e-06, + "loss": 0.0291, + "num_tokens": 48074309.0, + "reward": 1.5413291454315186, + "reward_std": 0.074510857462883, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5413291454315186, + "rewards/correct_reward_func/std": 0.15059438347816467, + "step": 374 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1955.0, + "completions/max_terminated_length": 1955.0, + "completions/mean_length": 1333.75, + "completions/mean_terminated_length": 1333.75, + "completions/min_length": 892.0, + "completions/min_terminated_length": 892.0, + "epoch": 0.5841121495327103, + "grad_norm": 0.6363287568092346, + "kl": 0.04845425486564636, + "learning_rate": 1.7725e-06, + "loss": -0.0013, + "num_tokens": 48192344.0, + "reward": 1.4435120820999146, + "reward_std": 0.09547659754753113, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.455416738986969, + "rewards/correct_reward_func/std": 0.17547385394573212, + "step": 375 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1829.0, + "completions/max_terminated_length": 1829.0, + "completions/mean_length": 1348.09521484375, + "completions/mean_terminated_length": 1348.09521484375, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.5856697819314641, + "grad_norm": 0.6347678303718567, + "kl": 0.04872422479093075, + "learning_rate": 1.771875e-06, + "loss": -0.0063, + "num_tokens": 48311446.0, + "reward": 1.532442569732666, + "reward_std": 0.08060499280691147, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5324423909187317, + "rewards/correct_reward_func/std": 0.1389501988887787, + "step": 376 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2040.0, + "completions/max_terminated_length": 2040.0, + "completions/mean_length": 1335.6905517578125, + "completions/mean_terminated_length": 1335.6905517578125, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.5872274143302181, + "grad_norm": 0.6710498929023743, + "kl": 0.04991703853011131, + "learning_rate": 1.77125e-06, + "loss": -0.0097, + "num_tokens": 48429530.0, + "reward": 1.4852207899093628, + "reward_std": 0.11480290442705154, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5090302228927612, + "rewards/correct_reward_func/std": 0.18953540921211243, + "step": 377 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2115.0, + "completions/max_terminated_length": 2115.0, + "completions/mean_length": 1367.1190185546875, + "completions/mean_terminated_length": 1367.1190185546875, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "epoch": 0.5887850467289719, + "grad_norm": 0.5975840091705322, + "kl": 0.04870462976396084, + "learning_rate": 1.7706249999999999e-06, + "loss": 0.0098, + "num_tokens": 48550482.0, + "reward": 1.4639222621917725, + "reward_std": 0.09818486869335175, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.47582679986953735, + "rewards/correct_reward_func/std": 0.15385620296001434, + "step": 378 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2593.0, + "completions/max_terminated_length": 2593.0, + "completions/mean_length": 1391.6190185546875, + "completions/mean_terminated_length": 1391.6190185546875, + "completions/min_length": 927.0, + "completions/min_terminated_length": 927.0, + "epoch": 0.5903426791277259, + "grad_norm": 0.5850672721862793, + "kl": 0.04981931112706661, + "learning_rate": 1.77e-06, + "loss": -0.0048, + "num_tokens": 48673492.0, + "reward": 1.4705395698547363, + "reward_std": 0.07982930541038513, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4705394506454468, + "rewards/correct_reward_func/std": 0.1553632766008377, + "step": 379 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1856.0, + "completions/max_terminated_length": 1856.0, + "completions/mean_length": 1282.952392578125, + "completions/mean_terminated_length": 1282.952392578125, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "epoch": 0.5919003115264797, + "grad_norm": 0.6228066682815552, + "kl": 0.048284122720360756, + "learning_rate": 1.769375e-06, + "loss": -0.0227, + "num_tokens": 48786996.0, + "reward": 1.436045527458191, + "reward_std": 0.05097164586186409, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4360455572605133, + "rewards/correct_reward_func/std": 0.12165073305368423, + "step": 380 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2251.0, + "completions/max_terminated_length": 2251.0, + "completions/mean_length": 1336.7738037109375, + "completions/mean_terminated_length": 1336.7738037109375, + "completions/min_length": 766.0, + "completions/min_terminated_length": 766.0, + "epoch": 0.5934579439252337, + "grad_norm": 0.6182482838630676, + "kl": 0.047869689762592316, + "learning_rate": 1.76875e-06, + "loss": -0.0094, + "num_tokens": 48905447.0, + "reward": 1.4301729202270508, + "reward_std": 0.08307760953903198, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44207748770713806, + "rewards/correct_reward_func/std": 0.16730858385562897, + "step": 381 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2161.0, + "completions/max_terminated_length": 2161.0, + "completions/mean_length": 1325.9881591796875, + "completions/mean_terminated_length": 1325.9881591796875, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "epoch": 0.5950155763239875, + "grad_norm": 0.6429280638694763, + "kl": 0.0489403922110796, + "learning_rate": 1.768125e-06, + "loss": -0.0063, + "num_tokens": 49022764.0, + "reward": 1.4742978811264038, + "reward_std": 0.04766622185707092, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47429779171943665, + "rewards/correct_reward_func/std": 0.11626514792442322, + "step": 382 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2004.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1350.09521484375, + "completions/mean_terminated_length": 1350.09521484375, + "completions/min_length": 694.0, + "completions/min_terminated_length": 694.0, + "epoch": 0.5965732087227414, + "grad_norm": 0.6260592341423035, + "kl": 0.05001649633049965, + "learning_rate": 1.7675e-06, + "loss": 0.0078, + "num_tokens": 49142112.0, + "reward": 1.5085965394973755, + "reward_std": 0.06493545323610306, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5085963606834412, + "rewards/correct_reward_func/std": 0.12024178355932236, + "step": 383 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2039.0, + "completions/max_terminated_length": 2039.0, + "completions/mean_length": 1421.166748046875, + "completions/mean_terminated_length": 1421.166748046875, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "epoch": 0.5981308411214953, + "grad_norm": 0.577016294002533, + "kl": 0.05048423446714878, + "learning_rate": 1.766875e-06, + "loss": 0.0171, + "num_tokens": 49267316.0, + "reward": 1.4990577697753906, + "reward_std": 0.10350355505943298, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5109624266624451, + "rewards/correct_reward_func/std": 0.17975008487701416, + "step": 384 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2037.0, + "completions/max_terminated_length": 2037.0, + "completions/mean_length": 1388.6190185546875, + "completions/mean_terminated_length": 1388.6190185546875, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "epoch": 0.5996884735202492, + "grad_norm": 0.6296937465667725, + "kl": 0.048955587670207024, + "learning_rate": 1.76625e-06, + "loss": 0.0216, + "num_tokens": 49390212.0, + "reward": 1.4979599714279175, + "reward_std": 0.09178230166435242, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5098646283149719, + "rewards/correct_reward_func/std": 0.16648715734481812, + "step": 385 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1842.0, + "completions/max_terminated_length": 1842.0, + "completions/mean_length": 1338.8214111328125, + "completions/mean_terminated_length": 1338.8214111328125, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.6012461059190031, + "grad_norm": 0.5855390429496765, + "kl": 0.04758539795875549, + "learning_rate": 1.765625e-06, + "loss": -0.0172, + "num_tokens": 49508787.0, + "reward": 1.4601309299468994, + "reward_std": 0.04890606552362442, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46013087034225464, + "rewards/correct_reward_func/std": 0.1410851925611496, + "step": 386 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 1426.46435546875, + "completions/mean_terminated_length": 1344.9517822265625, + "completions/min_length": 233.0, + "completions/min_terminated_length": 233.0, + "epoch": 0.602803738317757, + "grad_norm": 0.5801010131835938, + "kl": 0.04862123541533947, + "learning_rate": 1.7649999999999998e-06, + "loss": 0.033, + "num_tokens": 49634694.0, + "reward": 1.5094380378723145, + "reward_std": 0.07422788441181183, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5094379186630249, + "rewards/correct_reward_func/std": 0.18413080275058746, + "step": 387 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1957.0, + "completions/max_terminated_length": 1957.0, + "completions/mean_length": 1389.4761962890625, + "completions/mean_terminated_length": 1389.4761962890625, + "completions/min_length": 806.0, + "completions/min_terminated_length": 806.0, + "epoch": 0.6043613707165109, + "grad_norm": 0.6577509641647339, + "kl": 0.050652796402573586, + "learning_rate": 1.764375e-06, + "loss": 0.0029, + "num_tokens": 49757398.0, + "reward": 1.5519939661026, + "reward_std": 0.06602007895708084, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5519937872886658, + "rewards/correct_reward_func/std": 0.12210499495267868, + "step": 388 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1984.0, + "completions/max_terminated_length": 1984.0, + "completions/mean_length": 1296.1785888671875, + "completions/mean_terminated_length": 1296.1785888671875, + "completions/min_length": 750.0, + "completions/min_terminated_length": 750.0, + "epoch": 0.6059190031152648, + "grad_norm": 0.6384318470954895, + "kl": 0.04982003942131996, + "learning_rate": 1.7637499999999998e-06, + "loss": -0.0317, + "num_tokens": 49872283.0, + "reward": 1.474208116531372, + "reward_std": 0.056270867586135864, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4742080271244049, + "rewards/correct_reward_func/std": 0.1628669947385788, + "step": 389 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2054.0, + "completions/max_terminated_length": 2054.0, + "completions/mean_length": 1370.7738037109375, + "completions/mean_terminated_length": 1370.7738037109375, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "epoch": 0.6074766355140186, + "grad_norm": 0.6070489287376404, + "kl": 0.04989171586930752, + "learning_rate": 1.763125e-06, + "loss": 0.0072, + "num_tokens": 49993458.0, + "reward": 1.4242897033691406, + "reward_std": 0.12058194726705551, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4361944794654846, + "rewards/correct_reward_func/std": 0.15444742143154144, + "step": 390 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1930.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1269.6190185546875, + "completions/mean_terminated_length": 1269.6190185546875, + "completions/min_length": 758.0, + "completions/min_terminated_length": 758.0, + "epoch": 0.6090342679127726, + "grad_norm": 0.6277110576629639, + "kl": 0.05084827356040478, + "learning_rate": 1.7624999999999999e-06, + "loss": -0.0106, + "num_tokens": 50105788.0, + "reward": 1.4761323928833008, + "reward_std": 0.08773455768823624, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47613224387168884, + "rewards/correct_reward_func/std": 0.18977254629135132, + "step": 391 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2028.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1340.547607421875, + "completions/mean_terminated_length": 1340.547607421875, + "completions/min_length": 660.0, + "completions/min_terminated_length": 660.0, + "epoch": 0.6105919003115264, + "grad_norm": 0.6418157815933228, + "kl": 0.05028718709945679, + "learning_rate": 1.761875e-06, + "loss": 0.0052, + "num_tokens": 50224304.0, + "reward": 1.5189129114151, + "reward_std": 0.06859312951564789, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5189128518104553, + "rewards/correct_reward_func/std": 0.13187937438488007, + "step": 392 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 1348.2857666015625, + "completions/mean_terminated_length": 1348.2857666015625, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "epoch": 0.6121495327102804, + "grad_norm": 0.6196921467781067, + "kl": 0.05028197728097439, + "learning_rate": 1.7612499999999999e-06, + "loss": -0.0127, + "num_tokens": 50343602.0, + "reward": 1.4608813524246216, + "reward_std": 0.0647798702120781, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4608812928199768, + "rewards/correct_reward_func/std": 0.10814743489027023, + "step": 393 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 1874.0, + "completions/mean_length": 1432.761962890625, + "completions/mean_terminated_length": 1351.3251953125, + "completions/min_length": 841.0, + "completions/min_terminated_length": 841.0, + "epoch": 0.6137071651090342, + "grad_norm": 0.5915994048118591, + "kl": 0.04747145250439644, + "learning_rate": 1.760625e-06, + "loss": 0.0774, + "num_tokens": 50470044.0, + "reward": 1.4953646659851074, + "reward_std": 0.06458833068609238, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49536460638046265, + "rewards/correct_reward_func/std": 0.15716253221035004, + "step": 394 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2262.0, + "completions/max_terminated_length": 2262.0, + "completions/mean_length": 1341.4761962890625, + "completions/mean_terminated_length": 1341.4761962890625, + "completions/min_length": 772.0, + "completions/min_terminated_length": 772.0, + "epoch": 0.6152647975077882, + "grad_norm": 0.6374837756156921, + "kl": 0.05029851756989956, + "learning_rate": 1.7599999999999999e-06, + "loss": -0.0057, + "num_tokens": 50588620.0, + "reward": 1.4544249773025513, + "reward_std": 0.06668942421674728, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4544249176979065, + "rewards/correct_reward_func/std": 0.1269298493862152, + "step": 395 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2396.0, + "completions/max_terminated_length": 2396.0, + "completions/mean_length": 1336.6785888671875, + "completions/mean_terminated_length": 1336.6785888671875, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "epoch": 0.616822429906542, + "grad_norm": 0.6182777881622314, + "kl": 0.04916258528828621, + "learning_rate": 1.7593749999999998e-06, + "loss": -0.0244, + "num_tokens": 50706799.0, + "reward": 1.5116595029830933, + "reward_std": 0.056161068379879, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.511659562587738, + "rewards/correct_reward_func/std": 0.17195159196853638, + "step": 396 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2482.0, + "completions/max_terminated_length": 2482.0, + "completions/mean_length": 1375.8690185546875, + "completions/mean_terminated_length": 1375.8690185546875, + "completions/min_length": 937.0, + "completions/min_terminated_length": 937.0, + "epoch": 0.618380062305296, + "grad_norm": 0.60200434923172, + "kl": 0.049690814688801765, + "learning_rate": 1.7587499999999999e-06, + "loss": -0.0061, + "num_tokens": 50828270.0, + "reward": 1.492004156112671, + "reward_std": 0.07065374404191971, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4920039772987366, + "rewards/correct_reward_func/std": 0.14335261285305023, + "step": 397 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2264.0, + "completions/mean_length": 1516.9761962890625, + "completions/mean_terminated_length": 1436.55419921875, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "epoch": 0.6199376947040498, + "grad_norm": 0.587050199508667, + "kl": 0.048227181658148766, + "learning_rate": 1.7581249999999998e-06, + "loss": 0.0757, + "num_tokens": 50961774.0, + "reward": 1.5450599193572998, + "reward_std": 0.09599590301513672, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.556964635848999, + "rewards/correct_reward_func/std": 0.1692476123571396, + "step": 398 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2190.0, + "completions/max_terminated_length": 2190.0, + "completions/mean_length": 1380.5357666015625, + "completions/mean_terminated_length": 1380.5357666015625, + "completions/min_length": 816.0, + "completions/min_terminated_length": 816.0, + "epoch": 0.6214953271028038, + "grad_norm": 0.550334095954895, + "kl": 0.04830704443156719, + "learning_rate": 1.7575e-06, + "loss": -0.0391, + "num_tokens": 51083847.0, + "reward": 1.4870414733886719, + "reward_std": 0.06050838157534599, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48704153299331665, + "rewards/correct_reward_func/std": 0.13304997980594635, + "step": 399 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2251.0, + "completions/max_terminated_length": 2251.0, + "completions/mean_length": 1446.96435546875, + "completions/mean_terminated_length": 1446.96435546875, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.6230529595015576, + "grad_norm": 0.6399713754653931, + "kl": 0.04986717738211155, + "learning_rate": 1.7568749999999998e-06, + "loss": -0.0089, + "num_tokens": 51211506.0, + "reward": 1.505528211593628, + "reward_std": 0.057832684367895126, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5055281519889832, + "rewards/correct_reward_func/std": 0.18946535885334015, + "step": 400 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2223.0, + "completions/max_terminated_length": 2223.0, + "completions/mean_length": 1390.4285888671875, + "completions/mean_terminated_length": 1390.4285888671875, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.6246105919003115, + "grad_norm": 0.6153919100761414, + "kl": 0.048483846709132195, + "learning_rate": 1.75625e-06, + "loss": 0.0004, + "num_tokens": 51334536.0, + "reward": 1.5144027471542358, + "reward_std": 0.046569447964429855, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5144026875495911, + "rewards/correct_reward_func/std": 0.10261337459087372, + "step": 401 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2250.0, + "completions/mean_length": 1555.4761962890625, + "completions/mean_terminated_length": 1475.51806640625, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "epoch": 0.6261682242990654, + "grad_norm": 0.5729800462722778, + "kl": 0.04777614213526249, + "learning_rate": 1.7556249999999998e-06, + "loss": 0.0621, + "num_tokens": 51471334.0, + "reward": 1.477668285369873, + "reward_std": 0.08024942129850388, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47766822576522827, + "rewards/correct_reward_func/std": 0.12405380606651306, + "step": 402 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2004.0, + "completions/max_terminated_length": 2004.0, + "completions/mean_length": 1394.0714111328125, + "completions/mean_terminated_length": 1394.0714111328125, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "epoch": 0.6277258566978193, + "grad_norm": 0.6006665229797363, + "kl": 0.048750247806310654, + "learning_rate": 1.7549999999999997e-06, + "loss": 0.0141, + "num_tokens": 51594508.0, + "reward": 1.5268176794052124, + "reward_std": 0.06284648180007935, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5268176794052124, + "rewards/correct_reward_func/std": 0.1302812248468399, + "step": 403 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2047.0, + "completions/max_terminated_length": 2047.0, + "completions/mean_length": 1379.416748046875, + "completions/mean_terminated_length": 1379.416748046875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.6292834890965732, + "grad_norm": 0.6163754463195801, + "kl": 0.0525053720921278, + "learning_rate": 1.754375e-06, + "loss": -0.0313, + "num_tokens": 51716115.0, + "reward": 1.5108131170272827, + "reward_std": 0.09598790854215622, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5108129382133484, + "rewards/correct_reward_func/std": 0.17914439737796783, + "step": 404 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2625.0, + "completions/max_terminated_length": 2625.0, + "completions/mean_length": 1451.5595703125, + "completions/mean_terminated_length": 1451.5595703125, + "completions/min_length": 506.0, + "completions/min_terminated_length": 506.0, + "epoch": 0.6308411214953271, + "grad_norm": 0.6257968544960022, + "kl": 0.0501710157841444, + "learning_rate": 1.75375e-06, + "loss": -0.0006, + "num_tokens": 51843950.0, + "reward": 1.4702588319778442, + "reward_std": 0.08658844977617264, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48216357827186584, + "rewards/correct_reward_func/std": 0.13416936993598938, + "step": 405 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2347.0, + "completions/max_terminated_length": 2347.0, + "completions/mean_length": 1460.0, + "completions/mean_terminated_length": 1460.0, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "epoch": 0.632398753894081, + "grad_norm": 0.5904620885848999, + "kl": 0.049389807507395744, + "learning_rate": 1.753125e-06, + "loss": -0.0261, + "num_tokens": 51972656.0, + "reward": 1.5546208620071411, + "reward_std": 0.06326793879270554, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5546208024024963, + "rewards/correct_reward_func/std": 0.1749102622270584, + "step": 406 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3438.0, + "completions/max_terminated_length": 3438.0, + "completions/mean_length": 1483.4761962890625, + "completions/mean_terminated_length": 1483.4761962890625, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "epoch": 0.6339563862928349, + "grad_norm": 0.5588891506195068, + "kl": 0.04781218431890011, + "learning_rate": 1.7525e-06, + "loss": 0.0121, + "num_tokens": 52103292.0, + "reward": 1.540654182434082, + "reward_std": 0.06845831125974655, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5406539440155029, + "rewards/correct_reward_func/std": 0.22292810678482056, + "step": 407 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2493.0, + "completions/max_terminated_length": 2493.0, + "completions/mean_length": 1564.34521484375, + "completions/mean_terminated_length": 1564.34521484375, + "completions/min_length": 724.0, + "completions/min_terminated_length": 724.0, + "epoch": 0.6355140186915887, + "grad_norm": 0.568066418170929, + "kl": 0.050099100917577744, + "learning_rate": 1.751875e-06, + "loss": 0.0297, + "num_tokens": 52240961.0, + "reward": 1.460665225982666, + "reward_std": 0.04765839874744415, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46066510677337646, + "rewards/correct_reward_func/std": 0.09499367326498032, + "step": 408 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2255.0, + "completions/max_terminated_length": 2255.0, + "completions/mean_length": 1502.107177734375, + "completions/mean_terminated_length": 1502.107177734375, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.6370716510903427, + "grad_norm": 0.6036926507949829, + "kl": 0.049950817599892616, + "learning_rate": 1.75125e-06, + "loss": -0.0168, + "num_tokens": 52373144.0, + "reward": 1.4840716123580933, + "reward_std": 0.09531796723604202, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4959762990474701, + "rewards/correct_reward_func/std": 0.1371496170759201, + "step": 409 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.04761904761904767, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2686.0, + "completions/mean_length": 1858.1905517578125, + "completions/mean_terminated_length": 1541.5, + "completions/min_length": 838.0, + "completions/min_terminated_length": 838.0, + "epoch": 0.6386292834890965, + "grad_norm": 0.498012512922287, + "kl": 0.04354623891413212, + "learning_rate": 1.750625e-06, + "loss": 0.2021, + "num_tokens": 52535262.0, + "reward": 1.4536290168762207, + "reward_std": 0.09940145164728165, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4536289870738983, + "rewards/correct_reward_func/std": 0.18348245322704315, + "step": 410 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3503.0, + "completions/max_terminated_length": 3503.0, + "completions/mean_length": 1549.952392578125, + "completions/mean_terminated_length": 1549.952392578125, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "epoch": 0.6401869158878505, + "grad_norm": 0.5648190379142761, + "kl": 0.048112260177731514, + "learning_rate": 1.75e-06, + "loss": 0.004, + "num_tokens": 52671566.0, + "reward": 1.4433013200759888, + "reward_std": 0.09281626343727112, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.45520591735839844, + "rewards/correct_reward_func/std": 0.1479065865278244, + "step": 411 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2165.0, + "completions/max_terminated_length": 2165.0, + "completions/mean_length": 1474.416748046875, + "completions/mean_terminated_length": 1474.416748046875, + "completions/min_length": 852.0, + "completions/min_terminated_length": 852.0, + "epoch": 0.6417445482866043, + "grad_norm": 0.5981578826904297, + "kl": 0.05129780061542988, + "learning_rate": 1.7493749999999999e-06, + "loss": 0.0015, + "num_tokens": 52801327.0, + "reward": 1.5297049283981323, + "reward_std": 0.07844500243663788, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5297048687934875, + "rewards/correct_reward_func/std": 0.17416192591190338, + "step": 412 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2826.0, + "completions/max_terminated_length": 2826.0, + "completions/mean_length": 1552.8214111328125, + "completions/mean_terminated_length": 1552.8214111328125, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "epoch": 0.6433021806853583, + "grad_norm": 0.551489531993866, + "kl": 0.04923750273883343, + "learning_rate": 1.74875e-06, + "loss": -0.005, + "num_tokens": 52937830.0, + "reward": 1.544388771057129, + "reward_std": 0.07821746915578842, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5562934875488281, + "rewards/correct_reward_func/std": 0.155470609664917, + "step": 413 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2580.0, + "completions/max_terminated_length": 2580.0, + "completions/mean_length": 1587.2857666015625, + "completions/mean_terminated_length": 1587.2857666015625, + "completions/min_length": 1037.0, + "completions/min_terminated_length": 1037.0, + "epoch": 0.6448598130841121, + "grad_norm": 0.5425035953521729, + "kl": 0.049748532474040985, + "learning_rate": 1.7481249999999999e-06, + "loss": -0.0384, + "num_tokens": 53077198.0, + "reward": 1.522072434425354, + "reward_std": 0.10684633255004883, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5339770317077637, + "rewards/correct_reward_func/std": 0.17637047171592712, + "step": 414 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2465.0, + "completions/mean_length": 1585.9405517578125, + "completions/mean_terminated_length": 1506.349365234375, + "completions/min_length": 975.0, + "completions/min_terminated_length": 975.0, + "epoch": 0.6464174454828661, + "grad_norm": 0.5822159647941589, + "kl": 0.04930712282657623, + "learning_rate": 1.7475e-06, + "loss": 0.0505, + "num_tokens": 53216147.0, + "reward": 1.520836591720581, + "reward_std": 0.09006838500499725, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5208365321159363, + "rewards/correct_reward_func/std": 0.19083261489868164, + "step": 415 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2523.0, + "completions/mean_length": 1632.4405517578125, + "completions/mean_terminated_length": 1553.4095458984375, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "epoch": 0.6479750778816199, + "grad_norm": 0.5585830211639404, + "kl": 0.04699916951358318, + "learning_rate": 1.746875e-06, + "loss": -0.0579, + "num_tokens": 53359182.0, + "reward": 1.4239436388015747, + "reward_std": 0.05681487172842026, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42394357919692993, + "rewards/correct_reward_func/std": 0.12871341407299042, + "step": 416 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3221.0, + "completions/max_terminated_length": 3221.0, + "completions/mean_length": 1636.5357666015625, + "completions/mean_terminated_length": 1636.5357666015625, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.6495327102803738, + "grad_norm": 0.6002727150917053, + "kl": 0.05007455497980118, + "learning_rate": 1.74625e-06, + "loss": 0.0327, + "num_tokens": 53502591.0, + "reward": 1.4320467710494995, + "reward_std": 0.05887793377041817, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4320466220378876, + "rewards/correct_reward_func/std": 0.11791915446519852, + "step": 417 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2371.0, + "completions/mean_length": 1614.452392578125, + "completions/mean_terminated_length": 1535.2047119140625, + "completions/min_length": 796.0, + "completions/min_terminated_length": 796.0, + "epoch": 0.6510903426791277, + "grad_norm": 0.601240873336792, + "kl": 0.051675185561180115, + "learning_rate": 1.745625e-06, + "loss": 0.0749, + "num_tokens": 53644247.0, + "reward": 1.486029028892517, + "reward_std": 0.07044512033462524, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48602885007858276, + "rewards/correct_reward_func/std": 0.1468798667192459, + "step": 418 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3119.0, + "completions/max_terminated_length": 3119.0, + "completions/mean_length": 1591.75, + "completions/mean_terminated_length": 1591.75, + "completions/min_length": 1031.0, + "completions/min_terminated_length": 1031.0, + "epoch": 0.6526479750778816, + "grad_norm": 0.6253805756568909, + "kl": 0.051795635372400284, + "learning_rate": 1.745e-06, + "loss": -0.0031, + "num_tokens": 53783720.0, + "reward": 1.4498029947280884, + "reward_std": 0.08419051766395569, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46170762181282043, + "rewards/correct_reward_func/std": 0.17074517905712128, + "step": 419 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2441.0, + "completions/max_terminated_length": 2441.0, + "completions/mean_length": 1521.8214111328125, + "completions/mean_terminated_length": 1521.8214111328125, + "completions/min_length": 928.0, + "completions/min_terminated_length": 928.0, + "epoch": 0.6542056074766355, + "grad_norm": 0.6050965785980225, + "kl": 0.05233046971261501, + "learning_rate": 1.744375e-06, + "loss": -0.019, + "num_tokens": 53917445.0, + "reward": 1.435978651046753, + "reward_std": 0.12028548866510391, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.459788054227829, + "rewards/correct_reward_func/std": 0.14839471876621246, + "step": 420 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2524.0, + "completions/mean_length": 1687.952392578125, + "completions/mean_terminated_length": 1609.59033203125, + "completions/min_length": 913.0, + "completions/min_terminated_length": 913.0, + "epoch": 0.6557632398753894, + "grad_norm": 0.542452871799469, + "kl": 0.05065176263451576, + "learning_rate": 1.7437499999999998e-06, + "loss": 0.0578, + "num_tokens": 54065257.0, + "reward": 1.4500166177749634, + "reward_std": 0.09093791991472244, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4500165581703186, + "rewards/correct_reward_func/std": 0.16098248958587646, + "step": 421 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2327.0, + "completions/max_terminated_length": 2327.0, + "completions/mean_length": 1550.5714111328125, + "completions/mean_terminated_length": 1550.5714111328125, + "completions/min_length": 969.0, + "completions/min_terminated_length": 969.0, + "epoch": 0.6573208722741433, + "grad_norm": 0.5730518102645874, + "kl": 0.050289461389184, + "learning_rate": 1.743125e-06, + "loss": -0.0254, + "num_tokens": 54201613.0, + "reward": 1.4041987657546997, + "reward_std": 0.046286944299936295, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4041987359523773, + "rewards/correct_reward_func/std": 0.13344568014144897, + "step": 422 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2280.0, + "completions/mean_length": 1631.297607421875, + "completions/mean_terminated_length": 1552.2529296875, + "completions/min_length": 804.0, + "completions/min_terminated_length": 804.0, + "epoch": 0.6588785046728972, + "grad_norm": 0.5699371099472046, + "kl": 0.05113396793603897, + "learning_rate": 1.7424999999999998e-06, + "loss": 0.0358, + "num_tokens": 54344702.0, + "reward": 1.4777201414108276, + "reward_std": 0.07488631457090378, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47772011160850525, + "rewards/correct_reward_func/std": 0.13865311443805695, + "step": 423 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2303.0, + "completions/mean_length": 1677.59521484375, + "completions/mean_terminated_length": 1599.1083984375, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "epoch": 0.660436137071651, + "grad_norm": 0.5397751927375793, + "kl": 0.05114184692502022, + "learning_rate": 1.741875e-06, + "loss": -0.0164, + "num_tokens": 54491662.0, + "reward": 1.4992514848709106, + "reward_std": 0.0635334923863411, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4992513060569763, + "rewards/correct_reward_func/std": 0.1298021823167801, + "step": 424 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2204.0, + "completions/max_terminated_length": 2204.0, + "completions/mean_length": 1552.1429443359375, + "completions/mean_terminated_length": 1552.1429443359375, + "completions/min_length": 1036.0, + "completions/min_terminated_length": 1036.0, + "epoch": 0.661993769470405, + "grad_norm": 0.5741438269615173, + "kl": 0.05266575887799263, + "learning_rate": 1.7412499999999998e-06, + "loss": 0.0127, + "num_tokens": 54627778.0, + "reward": 1.4440009593963623, + "reward_std": 0.08349818736314774, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.45590564608573914, + "rewards/correct_reward_func/std": 0.17922662198543549, + "step": 425 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2499.0, + "completions/max_terminated_length": 2499.0, + "completions/mean_length": 1625.3929443359375, + "completions/mean_terminated_length": 1625.3929443359375, + "completions/min_length": 981.0, + "completions/min_terminated_length": 981.0, + "epoch": 0.6635514018691588, + "grad_norm": 0.5885212421417236, + "kl": 0.05119376443326473, + "learning_rate": 1.740625e-06, + "loss": 0.0118, + "num_tokens": 54770425.0, + "reward": 1.473638653755188, + "reward_std": 0.0765259712934494, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4736386239528656, + "rewards/correct_reward_func/std": 0.10725454986095428, + "step": 426 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2610.0, + "completions/mean_length": 1690.8333740234375, + "completions/mean_terminated_length": 1612.5059814453125, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "epoch": 0.6651090342679128, + "grad_norm": 0.5927914381027222, + "kl": 0.04985920339822769, + "learning_rate": 1.7399999999999999e-06, + "loss": 0.0531, + "num_tokens": 54918389.0, + "reward": 1.4642417430877686, + "reward_std": 0.12046536058187485, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4761464297771454, + "rewards/correct_reward_func/std": 0.1584354043006897, + "step": 427 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2421.0, + "completions/max_terminated_length": 2421.0, + "completions/mean_length": 1637.1190185546875, + "completions/mean_terminated_length": 1637.1190185546875, + "completions/min_length": 1066.0, + "completions/min_terminated_length": 1066.0, + "epoch": 0.6666666666666666, + "grad_norm": 0.5880814790725708, + "kl": 0.05253339186310768, + "learning_rate": 1.7393749999999998e-06, + "loss": -0.0262, + "num_tokens": 55061943.0, + "reward": 1.5044912099838257, + "reward_std": 0.08261405676603317, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5044911503791809, + "rewards/correct_reward_func/std": 0.17851826548576355, + "step": 428 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3001.0, + "completions/mean_length": 1644.8929443359375, + "completions/mean_terminated_length": 1566.011962890625, + "completions/min_length": 1051.0, + "completions/min_terminated_length": 1051.0, + "epoch": 0.6682242990654206, + "grad_norm": 0.5974320769309998, + "kl": 0.04985599033534527, + "learning_rate": 1.7387499999999999e-06, + "loss": 0.0317, + "num_tokens": 55205916.0, + "reward": 1.472242832183838, + "reward_std": 0.13399535417556763, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4960523247718811, + "rewards/correct_reward_func/std": 0.16596059501171112, + "step": 429 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2914.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 1612.9881591796875, + "completions/mean_terminated_length": 1612.9881591796875, + "completions/min_length": 673.0, + "completions/min_terminated_length": 673.0, + "epoch": 0.6697819314641744, + "grad_norm": 0.5865140557289124, + "kl": 0.05195549875497818, + "learning_rate": 1.7381249999999998e-06, + "loss": 0.0019, + "num_tokens": 55347581.0, + "reward": 1.4284037351608276, + "reward_std": 0.09993235766887665, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.45221319794654846, + "rewards/correct_reward_func/std": 0.11526290327310562, + "step": 430 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2541.0, + "completions/max_terminated_length": 2541.0, + "completions/mean_length": 1535.5714111328125, + "completions/mean_terminated_length": 1535.5714111328125, + "completions/min_length": 992.0, + "completions/min_terminated_length": 992.0, + "epoch": 0.6713395638629284, + "grad_norm": 0.5939237475395203, + "kl": 0.05092081241309643, + "learning_rate": 1.7374999999999999e-06, + "loss": 0.0117, + "num_tokens": 55482497.0, + "reward": 1.5227851867675781, + "reward_std": 0.09420502930879593, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5346897840499878, + "rewards/correct_reward_func/std": 0.13495904207229614, + "step": 431 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2354.0, + "completions/max_terminated_length": 2354.0, + "completions/mean_length": 1559.416748046875, + "completions/mean_terminated_length": 1559.416748046875, + "completions/min_length": 928.0, + "completions/min_terminated_length": 928.0, + "epoch": 0.6728971962616822, + "grad_norm": 0.5865123271942139, + "kl": 0.05216217786073685, + "learning_rate": 1.7368749999999998e-06, + "loss": 0.0131, + "num_tokens": 55619320.0, + "reward": 1.5050667524337769, + "reward_std": 0.08095559477806091, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5169714689254761, + "rewards/correct_reward_func/std": 0.18940287828445435, + "step": 432 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2694.0, + "completions/max_terminated_length": 2694.0, + "completions/mean_length": 1583.1429443359375, + "completions/mean_terminated_length": 1583.1429443359375, + "completions/min_length": 918.0, + "completions/min_terminated_length": 918.0, + "epoch": 0.6744548286604362, + "grad_norm": 0.6121124625205994, + "kl": 0.04994286224246025, + "learning_rate": 1.7362499999999999e-06, + "loss": 0.0266, + "num_tokens": 55758322.0, + "reward": 1.4305202960968018, + "reward_std": 0.0557343065738678, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43052029609680176, + "rewards/correct_reward_func/std": 0.15990039706230164, + "step": 433 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 4451.0, + "completions/mean_length": 1662.0357666015625, + "completions/mean_terminated_length": 1583.361328125, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.67601246105919, + "grad_norm": 0.5868165493011475, + "kl": 0.04885072074830532, + "learning_rate": 1.7356249999999998e-06, + "loss": 0.0522, + "num_tokens": 55903879.0, + "reward": 1.4234789609909058, + "reward_std": 0.1545991748571396, + "rewards/contains_chinese/mean": 0.9523809552192688, + "rewards/contains_chinese/std": 0.21423791348934174, + "rewards/correct_reward_func/mean": 0.47109803557395935, + "rewards/correct_reward_func/std": 0.1350628137588501, + "step": 434 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2375.0, + "completions/max_terminated_length": 2375.0, + "completions/mean_length": 1513.5714111328125, + "completions/mean_terminated_length": 1513.5714111328125, + "completions/min_length": 948.0, + "completions/min_terminated_length": 948.0, + "epoch": 0.677570093457944, + "grad_norm": 0.5805670619010925, + "kl": 0.052099065855145454, + "learning_rate": 1.7350000000000001e-06, + "loss": -0.0069, + "num_tokens": 56036809.0, + "reward": 1.4437384605407715, + "reward_std": 0.05860109254717827, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44373825192451477, + "rewards/correct_reward_func/std": 0.19147835671901703, + "step": 435 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2622.0, + "completions/mean_length": 1667.5238037109375, + "completions/mean_terminated_length": 1588.9156494140625, + "completions/min_length": 975.0, + "completions/min_terminated_length": 975.0, + "epoch": 0.6791277258566978, + "grad_norm": 0.5827205181121826, + "kl": 0.048958078026771545, + "learning_rate": 1.734375e-06, + "loss": 0.0677, + "num_tokens": 56182893.0, + "reward": 1.4653565883636475, + "reward_std": 0.08377533406019211, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4653565585613251, + "rewards/correct_reward_func/std": 0.17159578204154968, + "step": 436 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2398.0, + "completions/mean_length": 1661.6785888671875, + "completions/mean_terminated_length": 1583.0, + "completions/min_length": 852.0, + "completions/min_terminated_length": 852.0, + "epoch": 0.6806853582554517, + "grad_norm": 0.5805427432060242, + "kl": 0.04939436540007591, + "learning_rate": 1.73375e-06, + "loss": -0.0382, + "num_tokens": 56328444.0, + "reward": 1.443485140800476, + "reward_std": 0.06904201209545135, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44348499178886414, + "rewards/correct_reward_func/std": 0.1347956359386444, + "step": 437 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2602.0, + "completions/mean_length": 1691.107177734375, + "completions/mean_terminated_length": 1612.7830810546875, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "epoch": 0.6822429906542056, + "grad_norm": 0.5515686273574829, + "kl": 0.05067290551960468, + "learning_rate": 1.733125e-06, + "loss": 0.0507, + "num_tokens": 56476701.0, + "reward": 1.4295130968093872, + "reward_std": 0.11515135318040848, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4533223509788513, + "rewards/correct_reward_func/std": 0.16026785969734192, + "step": 438 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2201.0, + "completions/max_terminated_length": 2201.0, + "completions/mean_length": 1521.6309814453125, + "completions/mean_terminated_length": 1521.6309814453125, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.6838006230529595, + "grad_norm": 0.5936988592147827, + "kl": 0.052045663818717, + "learning_rate": 1.7325e-06, + "loss": 0.031, + "num_tokens": 56610566.0, + "reward": 1.4802929162979126, + "reward_std": 0.08335726708173752, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.504102349281311, + "rewards/correct_reward_func/std": 0.1541454792022705, + "step": 439 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2006.0, + "completions/max_terminated_length": 2006.0, + "completions/mean_length": 1464.0595703125, + "completions/mean_terminated_length": 1464.0595703125, + "completions/min_length": 916.0, + "completions/min_terminated_length": 916.0, + "epoch": 0.6853582554517134, + "grad_norm": 0.6046100854873657, + "kl": 0.05066749081015587, + "learning_rate": 1.731875e-06, + "loss": -0.006, + "num_tokens": 56739727.0, + "reward": 1.4421093463897705, + "reward_std": 0.05164036527276039, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4421093463897705, + "rewards/correct_reward_func/std": 0.14187321066856384, + "step": 440 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2175.0, + "completions/mean_length": 1621.3929443359375, + "completions/mean_terminated_length": 1461.134033203125, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.6869158878504673, + "grad_norm": 0.5737205147743225, + "kl": 0.04900176823139191, + "learning_rate": 1.73125e-06, + "loss": 0.0934, + "num_tokens": 56881816.0, + "reward": 1.42433500289917, + "reward_std": 0.0748237892985344, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42433494329452515, + "rewards/correct_reward_func/std": 0.18206322193145752, + "step": 441 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2703.0, + "completions/max_terminated_length": 2703.0, + "completions/mean_length": 1491.047607421875, + "completions/mean_terminated_length": 1491.047607421875, + "completions/min_length": 932.0, + "completions/min_terminated_length": 932.0, + "epoch": 0.6884735202492211, + "grad_norm": 0.6053724884986877, + "kl": 0.052103569731116295, + "learning_rate": 1.730625e-06, + "loss": -0.0204, + "num_tokens": 57012968.0, + "reward": 1.5510308742523193, + "reward_std": 0.09038885682821274, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5629354119300842, + "rewards/correct_reward_func/std": 0.22346119582653046, + "step": 442 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2108.0, + "completions/max_terminated_length": 2108.0, + "completions/mean_length": 1477.84521484375, + "completions/mean_terminated_length": 1477.84521484375, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "epoch": 0.6900311526479751, + "grad_norm": 0.5881515741348267, + "kl": 0.05128224939107895, + "learning_rate": 1.73e-06, + "loss": -0.0017, + "num_tokens": 57143023.0, + "reward": 1.4883114099502563, + "reward_std": 0.062143657356500626, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48831140995025635, + "rewards/correct_reward_func/std": 0.13774645328521729, + "step": 443 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2033.0, + "completions/max_terminated_length": 2033.0, + "completions/mean_length": 1422.7261962890625, + "completions/mean_terminated_length": 1422.7261962890625, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "epoch": 0.6915887850467289, + "grad_norm": 0.6870121359825134, + "kl": 0.0524381622672081, + "learning_rate": 1.729375e-06, + "loss": 0.0179, + "num_tokens": 57268346.0, + "reward": 1.4325767755508423, + "reward_std": 0.11797544360160828, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4444815218448639, + "rewards/correct_reward_func/std": 0.13778088986873627, + "step": 444 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2291.0, + "completions/max_terminated_length": 2291.0, + "completions/mean_length": 1486.011962890625, + "completions/mean_terminated_length": 1486.011962890625, + "completions/min_length": 435.0, + "completions/min_terminated_length": 435.0, + "epoch": 0.6931464174454829, + "grad_norm": 0.589530348777771, + "kl": 0.04958914779126644, + "learning_rate": 1.72875e-06, + "loss": -0.0082, + "num_tokens": 57399075.0, + "reward": 1.4617598056793213, + "reward_std": 0.0732613280415535, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4617597460746765, + "rewards/correct_reward_func/std": 0.20478412508964539, + "step": 445 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2343.0, + "completions/max_terminated_length": 2343.0, + "completions/mean_length": 1440.261962890625, + "completions/mean_terminated_length": 1440.261962890625, + "completions/min_length": 687.0, + "completions/min_terminated_length": 687.0, + "epoch": 0.6947040498442367, + "grad_norm": 0.5789417624473572, + "kl": 0.049345508217811584, + "learning_rate": 1.7281249999999999e-06, + "loss": -0.0048, + "num_tokens": 57526129.0, + "reward": 1.4764251708984375, + "reward_std": 0.060231760144233704, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4764251708984375, + "rewards/correct_reward_func/std": 0.15916836261749268, + "step": 446 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1481.5, + "completions/mean_terminated_length": 1481.5, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "epoch": 0.6962616822429907, + "grad_norm": 0.5849701762199402, + "kl": 0.052394647151231766, + "learning_rate": 1.7275e-06, + "loss": -0.0097, + "num_tokens": 57656521.0, + "reward": 1.5263036489486694, + "reward_std": 0.07463201880455017, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5263035893440247, + "rewards/correct_reward_func/std": 0.17446979880332947, + "step": 447 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3242.0, + "completions/max_terminated_length": 3242.0, + "completions/mean_length": 1495.3095703125, + "completions/mean_terminated_length": 1495.3095703125, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.6978193146417445, + "grad_norm": 0.628301203250885, + "kl": 0.05113241821527481, + "learning_rate": 1.7268749999999999e-06, + "loss": 0.0129, + "num_tokens": 57788163.0, + "reward": 1.4820013046264648, + "reward_std": 0.09342510253190994, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.49390602111816406, + "rewards/correct_reward_func/std": 0.12805330753326416, + "step": 448 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2319.0, + "completions/max_terminated_length": 2319.0, + "completions/mean_length": 1449.011962890625, + "completions/mean_terminated_length": 1449.011962890625, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "epoch": 0.6993769470404985, + "grad_norm": 0.626068651676178, + "kl": 0.05249497666954994, + "learning_rate": 1.72625e-06, + "loss": 0.0168, + "num_tokens": 57916012.0, + "reward": 1.4972912073135376, + "reward_std": 0.07907280325889587, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.509195864200592, + "rewards/correct_reward_func/std": 0.15952186286449432, + "step": 449 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1982.0, + "completions/max_terminated_length": 1982.0, + "completions/mean_length": 1396.7381591796875, + "completions/mean_terminated_length": 1396.7381591796875, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "epoch": 0.7009345794392523, + "grad_norm": 0.5788159966468811, + "kl": 0.0519126933068037, + "learning_rate": 1.7256249999999999e-06, + "loss": -0.0161, + "num_tokens": 58039266.0, + "reward": 1.5347965955734253, + "reward_std": 0.05890589952468872, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5347966551780701, + "rewards/correct_reward_func/std": 0.13743919134140015, + "step": 450 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 1381.011962890625, + "completions/mean_terminated_length": 1381.011962890625, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "epoch": 0.7024922118380063, + "grad_norm": 0.6056314706802368, + "kl": 0.05103152059018612, + "learning_rate": 1.725e-06, + "loss": 0.0104, + "num_tokens": 58161217.0, + "reward": 1.479777455329895, + "reward_std": 0.06452760100364685, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47977739572525024, + "rewards/correct_reward_func/std": 0.1589631587266922, + "step": 451 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1997.0, + "completions/max_terminated_length": 1997.0, + "completions/mean_length": 1404.9285888671875, + "completions/mean_terminated_length": 1404.9285888671875, + "completions/min_length": 574.0, + "completions/min_terminated_length": 574.0, + "epoch": 0.7040498442367601, + "grad_norm": 0.6056322455406189, + "kl": 0.05121096037328243, + "learning_rate": 1.724375e-06, + "loss": 0.0433, + "num_tokens": 58285201.0, + "reward": 1.568735957145691, + "reward_std": 0.0742240622639656, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5687359571456909, + "rewards/correct_reward_func/std": 0.1560218781232834, + "step": 452 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2313.0, + "completions/mean_length": 1456.1429443359375, + "completions/mean_terminated_length": 1374.9879150390625, + "completions/min_length": 926.0, + "completions/min_terminated_length": 926.0, + "epoch": 0.705607476635514, + "grad_norm": 0.6087128520011902, + "kl": 0.05143558606505394, + "learning_rate": 1.7237499999999998e-06, + "loss": 0.0035, + "num_tokens": 58413421.0, + "reward": 1.4904062747955322, + "reward_std": 0.09947887063026428, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5023109912872314, + "rewards/correct_reward_func/std": 0.18832674622535706, + "step": 453 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2173.0, + "completions/max_terminated_length": 2173.0, + "completions/mean_length": 1527.4285888671875, + "completions/mean_terminated_length": 1527.4285888671875, + "completions/min_length": 585.0, + "completions/min_terminated_length": 585.0, + "epoch": 0.7071651090342679, + "grad_norm": 0.607707679271698, + "kl": 0.054172057658433914, + "learning_rate": 1.723125e-06, + "loss": -0.0295, + "num_tokens": 58547887.0, + "reward": 1.4630801677703857, + "reward_std": 0.07300285249948502, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4749848544597626, + "rewards/correct_reward_func/std": 0.16017581522464752, + "step": 454 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2094.0, + "completions/max_terminated_length": 2094.0, + "completions/mean_length": 1457.2857666015625, + "completions/mean_terminated_length": 1457.2857666015625, + "completions/min_length": 841.0, + "completions/min_terminated_length": 841.0, + "epoch": 0.7087227414330218, + "grad_norm": 0.6382337212562561, + "kl": 0.05202684551477432, + "learning_rate": 1.7224999999999998e-06, + "loss": -0.0015, + "num_tokens": 58676275.0, + "reward": 1.4991950988769531, + "reward_std": 0.10090982168912888, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5110997557640076, + "rewards/correct_reward_func/std": 0.14260563254356384, + "step": 455 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2076.0, + "completions/mean_length": 1584.9285888671875, + "completions/mean_terminated_length": 1423.7803955078125, + "completions/min_length": 477.0, + "completions/min_terminated_length": 477.0, + "epoch": 0.7102803738317757, + "grad_norm": 0.567070722579956, + "kl": 0.08697609417140484, + "learning_rate": 1.721875e-06, + "loss": 0.104, + "num_tokens": 58815337.0, + "reward": 1.4610228538513184, + "reward_std": 0.08824088424444199, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46102282404899597, + "rewards/correct_reward_func/std": 0.17577558755874634, + "step": 456 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2597.0, + "completions/mean_length": 1508.1190185546875, + "completions/mean_terminated_length": 1427.59033203125, + "completions/min_length": 826.0, + "completions/min_terminated_length": 826.0, + "epoch": 0.7118380062305296, + "grad_norm": 0.579940140247345, + "kl": 0.04937991686165333, + "learning_rate": 1.7212499999999998e-06, + "loss": 0.0414, + "num_tokens": 58947815.0, + "reward": 1.44125497341156, + "reward_std": 0.09176965802907944, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4531596899032593, + "rewards/correct_reward_func/std": 0.11111555248498917, + "step": 457 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2106.0, + "completions/max_terminated_length": 2106.0, + "completions/mean_length": 1457.7261962890625, + "completions/mean_terminated_length": 1457.7261962890625, + "completions/min_length": 845.0, + "completions/min_terminated_length": 845.0, + "epoch": 0.7133956386292835, + "grad_norm": 0.5890275239944458, + "kl": 0.04897093586623669, + "learning_rate": 1.720625e-06, + "loss": -0.0176, + "num_tokens": 59076216.0, + "reward": 1.4860827922821045, + "reward_std": 0.08254723250865936, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4979875087738037, + "rewards/correct_reward_func/std": 0.14583896100521088, + "step": 458 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2329.0, + "completions/max_terminated_length": 2329.0, + "completions/mean_length": 1443.0, + "completions/mean_terminated_length": 1443.0, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "epoch": 0.7149532710280374, + "grad_norm": 0.6299088001251221, + "kl": 0.05053492821753025, + "learning_rate": 1.7199999999999998e-06, + "loss": 0.0295, + "num_tokens": 59203470.0, + "reward": 1.4992488622665405, + "reward_std": 0.08619312942028046, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5111536979675293, + "rewards/correct_reward_func/std": 0.1396382600069046, + "step": 459 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2038.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 1419.607177734375, + "completions/mean_terminated_length": 1419.607177734375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "epoch": 0.7165109034267912, + "grad_norm": 0.6011829376220703, + "kl": 0.051410723477602005, + "learning_rate": 1.719375e-06, + "loss": 0.0157, + "num_tokens": 59328723.0, + "reward": 1.5500842332839966, + "reward_std": 0.06590086221694946, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5500842332839966, + "rewards/correct_reward_func/std": 0.15555351972579956, + "step": 460 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2246.0, + "completions/mean_length": 1639.3690185546875, + "completions/mean_terminated_length": 1479.5487060546875, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "epoch": 0.7180685358255452, + "grad_norm": 0.5221880078315735, + "kl": 0.04706592485308647, + "learning_rate": 1.7187499999999998e-06, + "loss": 0.1247, + "num_tokens": 59472472.0, + "reward": 1.4629874229431152, + "reward_std": 0.11201505362987518, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.47489219903945923, + "rewards/correct_reward_func/std": 0.16951104998588562, + "step": 461 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2356.0, + "completions/max_terminated_length": 2356.0, + "completions/mean_length": 1470.6785888671875, + "completions/mean_terminated_length": 1470.6785888671875, + "completions/min_length": 944.0, + "completions/min_terminated_length": 944.0, + "epoch": 0.719626168224299, + "grad_norm": 0.605837345123291, + "kl": 0.04970187321305275, + "learning_rate": 1.7181249999999997e-06, + "loss": 0.0014, + "num_tokens": 59602219.0, + "reward": 1.5099036693572998, + "reward_std": 0.06017957627773285, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5099035501480103, + "rewards/correct_reward_func/std": 0.1773725152015686, + "step": 462 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2662.0, + "completions/max_terminated_length": 2662.0, + "completions/mean_length": 1585.297607421875, + "completions/mean_terminated_length": 1585.297607421875, + "completions/min_length": 985.0, + "completions/min_terminated_length": 985.0, + "epoch": 0.721183800623053, + "grad_norm": 0.5602222681045532, + "kl": 0.049900198355317116, + "learning_rate": 1.7174999999999999e-06, + "loss": -0.0306, + "num_tokens": 59741534.0, + "reward": 1.505386233329773, + "reward_std": 0.07570932060480118, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5053861141204834, + "rewards/correct_reward_func/std": 0.11848840862512589, + "step": 463 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2467.0, + "completions/max_terminated_length": 2467.0, + "completions/mean_length": 1453.7381591796875, + "completions/mean_terminated_length": 1453.7381591796875, + "completions/min_length": 835.0, + "completions/min_terminated_length": 835.0, + "epoch": 0.7227414330218068, + "grad_norm": 0.5771290063858032, + "kl": 0.05098097398877144, + "learning_rate": 1.7168749999999998e-06, + "loss": -0.0083, + "num_tokens": 59869402.0, + "reward": 1.41793692111969, + "reward_std": 0.07035666704177856, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41793686151504517, + "rewards/correct_reward_func/std": 0.12359312176704407, + "step": 464 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2362.0, + "completions/max_terminated_length": 2362.0, + "completions/mean_length": 1601.15478515625, + "completions/mean_terminated_length": 1601.15478515625, + "completions/min_length": 967.0, + "completions/min_terminated_length": 967.0, + "epoch": 0.7242990654205608, + "grad_norm": 0.5878456830978394, + "kl": 0.051242388784885406, + "learning_rate": 1.7162499999999999e-06, + "loss": -0.0256, + "num_tokens": 60010073.0, + "reward": 1.5170272588729858, + "reward_std": 0.08878003805875778, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5289318561553955, + "rewards/correct_reward_func/std": 0.1320001482963562, + "step": 465 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3673.0, + "completions/max_terminated_length": 3673.0, + "completions/mean_length": 1584.96435546875, + "completions/mean_terminated_length": 1584.96435546875, + "completions/min_length": 880.0, + "completions/min_terminated_length": 880.0, + "epoch": 0.7258566978193146, + "grad_norm": 0.56437748670578, + "kl": 0.05264845862984657, + "learning_rate": 1.7156249999999998e-06, + "loss": -0.0124, + "num_tokens": 60149144.0, + "reward": 1.449500560760498, + "reward_std": 0.06773830950260162, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44950050115585327, + "rewards/correct_reward_func/std": 0.15649531781673431, + "step": 466 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2261.0, + "completions/max_terminated_length": 2261.0, + "completions/mean_length": 1570.761962890625, + "completions/mean_terminated_length": 1570.761962890625, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "epoch": 0.7274143302180686, + "grad_norm": 0.586487352848053, + "kl": 0.05096551589667797, + "learning_rate": 1.715e-06, + "loss": -0.002, + "num_tokens": 60287136.0, + "reward": 1.5255881547927856, + "reward_std": 0.08541964739561081, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5374928712844849, + "rewards/correct_reward_func/std": 0.18721869587898254, + "step": 467 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2230.0, + "completions/max_terminated_length": 2230.0, + "completions/mean_length": 1424.4405517578125, + "completions/mean_terminated_length": 1424.4405517578125, + "completions/min_length": 53.0, + "completions/min_terminated_length": 53.0, + "epoch": 0.7289719626168224, + "grad_norm": 0.5780627727508545, + "kl": 0.05155480466783047, + "learning_rate": 1.714375e-06, + "loss": -0.0576, + "num_tokens": 60412483.0, + "reward": 1.4178261756896973, + "reward_std": 0.0780172049999237, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4178261458873749, + "rewards/correct_reward_func/std": 0.15439294278621674, + "step": 468 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2758.0, + "completions/max_terminated_length": 2758.0, + "completions/mean_length": 1656.5357666015625, + "completions/mean_terminated_length": 1656.5357666015625, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.7305295950155763, + "grad_norm": 0.602411687374115, + "kl": 0.05079780891537666, + "learning_rate": 1.7137500000000001e-06, + "loss": -0.0068, + "num_tokens": 60557530.0, + "reward": 1.455553412437439, + "reward_std": 0.05027348920702934, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45555347204208374, + "rewards/correct_reward_func/std": 0.08646845817565918, + "step": 469 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2639.0, + "completions/mean_length": 1774.96435546875, + "completions/mean_terminated_length": 1697.6505126953125, + "completions/min_length": 1001.0, + "completions/min_terminated_length": 1001.0, + "epoch": 0.7320872274143302, + "grad_norm": 0.5296007394790649, + "kl": 0.04803318716585636, + "learning_rate": 1.713125e-06, + "loss": 0.0229, + "num_tokens": 60712831.0, + "reward": 1.4119234085083008, + "reward_std": 0.0688636526465416, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.411923348903656, + "rewards/correct_reward_func/std": 0.15681524574756622, + "step": 470 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2303.0, + "completions/max_terminated_length": 2303.0, + "completions/mean_length": 1522.75, + "completions/mean_terminated_length": 1522.75, + "completions/min_length": 774.0, + "completions/min_terminated_length": 774.0, + "epoch": 0.7336448598130841, + "grad_norm": 0.5558991432189941, + "kl": 0.051254723221063614, + "learning_rate": 1.7125e-06, + "loss": 0.0205, + "num_tokens": 60846634.0, + "reward": 1.4586231708526611, + "reward_std": 0.08559418469667435, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4705279767513275, + "rewards/correct_reward_func/std": 0.13128410279750824, + "step": 471 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2990.0, + "completions/max_terminated_length": 2990.0, + "completions/mean_length": 1663.6785888671875, + "completions/mean_terminated_length": 1663.6785888671875, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "epoch": 0.735202492211838, + "grad_norm": 0.5688772797584534, + "kl": 0.0507583636790514, + "learning_rate": 1.711875e-06, + "loss": 0.0262, + "num_tokens": 60992275.0, + "reward": 1.5023914575576782, + "reward_std": 0.1096097081899643, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5142960548400879, + "rewards/correct_reward_func/std": 0.14677509665489197, + "step": 472 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2984.0, + "completions/max_terminated_length": 2984.0, + "completions/mean_length": 1556.7738037109375, + "completions/mean_terminated_length": 1556.7738037109375, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "epoch": 0.7367601246105919, + "grad_norm": 0.6016846895217896, + "kl": 0.05065236613154411, + "learning_rate": 1.71125e-06, + "loss": 0.0569, + "num_tokens": 61128960.0, + "reward": 1.4518747329711914, + "reward_std": 0.06996078789234161, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46377936005592346, + "rewards/correct_reward_func/std": 0.1257256120443344, + "step": 473 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2511.0, + "completions/max_terminated_length": 2511.0, + "completions/mean_length": 1696.1429443359375, + "completions/mean_terminated_length": 1696.1429443359375, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "epoch": 0.7383177570093458, + "grad_norm": 0.5974622964859009, + "kl": 0.05056299455463886, + "learning_rate": 1.710625e-06, + "loss": -0.0015, + "num_tokens": 61277664.0, + "reward": 1.4762911796569824, + "reward_std": 0.04775853455066681, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47629112005233765, + "rewards/correct_reward_func/std": 0.1305771917104721, + "step": 474 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2737.0, + "completions/max_terminated_length": 2737.0, + "completions/mean_length": 1597.011962890625, + "completions/mean_terminated_length": 1597.011962890625, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "epoch": 0.7398753894080997, + "grad_norm": 0.5798767805099487, + "kl": 0.05241680145263672, + "learning_rate": 1.71e-06, + "loss": 0.001, + "num_tokens": 61417813.0, + "reward": 1.4604065418243408, + "reward_std": 0.07921247184276581, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4604065418243408, + "rewards/correct_reward_func/std": 0.12199635803699493, + "step": 475 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2322.0, + "completions/max_terminated_length": 2322.0, + "completions/mean_length": 1607.8929443359375, + "completions/mean_terminated_length": 1607.8929443359375, + "completions/min_length": 960.0, + "completions/min_terminated_length": 960.0, + "epoch": 0.7414330218068536, + "grad_norm": 0.5608850717544556, + "kl": 0.05262966826558113, + "learning_rate": 1.709375e-06, + "loss": 0.0272, + "num_tokens": 61558840.0, + "reward": 1.429826021194458, + "reward_std": 0.07498659938573837, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4417307674884796, + "rewards/correct_reward_func/std": 0.16307246685028076, + "step": 476 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2451.0, + "completions/max_terminated_length": 2451.0, + "completions/mean_length": 1651.3929443359375, + "completions/mean_terminated_length": 1651.3929443359375, + "completions/min_length": 886.0, + "completions/min_terminated_length": 886.0, + "epoch": 0.7429906542056075, + "grad_norm": 0.5357276201248169, + "kl": 0.05072159692645073, + "learning_rate": 1.70875e-06, + "loss": 0.0321, + "num_tokens": 61703707.0, + "reward": 1.4157724380493164, + "reward_std": 0.10736193507909775, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.43958187103271484, + "rewards/correct_reward_func/std": 0.12011130154132843, + "step": 477 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2437.0, + "completions/mean_length": 1622.2261962890625, + "completions/mean_terminated_length": 1543.072265625, + "completions/min_length": 736.0, + "completions/min_terminated_length": 736.0, + "epoch": 0.7445482866043613, + "grad_norm": 0.5624713897705078, + "kl": 0.050551433116197586, + "learning_rate": 1.7081249999999998e-06, + "loss": 0.0817, + "num_tokens": 61845956.0, + "reward": 1.4873374700546265, + "reward_std": 0.1054694801568985, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4992421567440033, + "rewards/correct_reward_func/std": 0.1856101006269455, + "step": 478 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 7608.0, + "completions/max_terminated_length": 7608.0, + "completions/mean_length": 1654.2738037109375, + "completions/mean_terminated_length": 1654.2738037109375, + "completions/min_length": 1067.0, + "completions/min_terminated_length": 1067.0, + "epoch": 0.7461059190031153, + "grad_norm": 0.5814685821533203, + "kl": 0.05036089010536671, + "learning_rate": 1.7075e-06, + "loss": 0.0464, + "num_tokens": 61991041.0, + "reward": 1.451461911201477, + "reward_std": 0.09037837386131287, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4633665978908539, + "rewards/correct_reward_func/std": 0.12058395892381668, + "step": 479 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2120.0, + "completions/max_terminated_length": 2120.0, + "completions/mean_length": 1494.3214111328125, + "completions/mean_terminated_length": 1494.3214111328125, + "completions/min_length": 1001.0, + "completions/min_terminated_length": 1001.0, + "epoch": 0.7476635514018691, + "grad_norm": 0.5886862277984619, + "kl": 0.050980525091290474, + "learning_rate": 1.7068749999999999e-06, + "loss": 0.0151, + "num_tokens": 62122528.0, + "reward": 1.461233139038086, + "reward_std": 0.07253991812467575, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.47313785552978516, + "rewards/correct_reward_func/std": 0.14707376062870026, + "step": 480 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2365.0, + "completions/max_terminated_length": 2365.0, + "completions/mean_length": 1512.6309814453125, + "completions/mean_terminated_length": 1512.6309814453125, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "epoch": 0.7492211838006231, + "grad_norm": 0.5945307016372681, + "kl": 0.0515163391828537, + "learning_rate": 1.70625e-06, + "loss": 0.0025, + "num_tokens": 62255565.0, + "reward": 1.5655839443206787, + "reward_std": 0.07332275062799454, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5655838251113892, + "rewards/correct_reward_func/std": 0.18415912985801697, + "step": 481 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2281.0, + "completions/max_terminated_length": 2281.0, + "completions/mean_length": 1462.0357666015625, + "completions/mean_terminated_length": 1462.0357666015625, + "completions/min_length": 871.0, + "completions/min_terminated_length": 871.0, + "epoch": 0.7507788161993769, + "grad_norm": 0.575648307800293, + "kl": 0.052306439727544785, + "learning_rate": 1.7056249999999999e-06, + "loss": -0.0139, + "num_tokens": 62384370.0, + "reward": 1.5397838354110718, + "reward_std": 0.0683126300573349, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.539783775806427, + "rewards/correct_reward_func/std": 0.17842523753643036, + "step": 482 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2599.0, + "completions/max_terminated_length": 2599.0, + "completions/mean_length": 1485.84521484375, + "completions/mean_terminated_length": 1485.84521484375, + "completions/min_length": 810.0, + "completions/min_terminated_length": 810.0, + "epoch": 0.7523364485981309, + "grad_norm": 0.5642300248146057, + "kl": 0.05181148275732994, + "learning_rate": 1.705e-06, + "loss": 0.0319, + "num_tokens": 62515067.0, + "reward": 1.4557358026504517, + "reward_std": 0.11825248599052429, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4795452356338501, + "rewards/correct_reward_func/std": 0.16276678442955017, + "step": 483 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2106.0, + "completions/max_terminated_length": 2106.0, + "completions/mean_length": 1477.65478515625, + "completions/mean_terminated_length": 1477.65478515625, + "completions/min_length": 900.0, + "completions/min_terminated_length": 900.0, + "epoch": 0.7538940809968847, + "grad_norm": 0.5979113578796387, + "kl": 0.05194063484668732, + "learning_rate": 1.7043749999999999e-06, + "loss": 0.0126, + "num_tokens": 62645214.0, + "reward": 1.4856928586959839, + "reward_std": 0.0904906839132309, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4975975453853607, + "rewards/correct_reward_func/std": 0.13653963804244995, + "step": 484 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2521.0, + "completions/max_terminated_length": 2521.0, + "completions/mean_length": 1486.6309814453125, + "completions/mean_terminated_length": 1486.6309814453125, + "completions/min_length": 902.0, + "completions/min_terminated_length": 902.0, + "epoch": 0.7554517133956387, + "grad_norm": 0.6199777722358704, + "kl": 0.05140496790409088, + "learning_rate": 1.70375e-06, + "loss": 0.0157, + "num_tokens": 62776049.0, + "reward": 1.491317629814148, + "reward_std": 0.06931986659765244, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4913175702095032, + "rewards/correct_reward_func/std": 0.14081616699695587, + "step": 485 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1881.0, + "completions/max_terminated_length": 1881.0, + "completions/mean_length": 1389.3095703125, + "completions/mean_terminated_length": 1389.3095703125, + "completions/min_length": 750.0, + "completions/min_terminated_length": 750.0, + "epoch": 0.7570093457943925, + "grad_norm": 0.5803106427192688, + "kl": 0.05373929440975189, + "learning_rate": 1.7031249999999999e-06, + "loss": -0.0063, + "num_tokens": 62898547.0, + "reward": 1.4282513856887817, + "reward_std": 0.06992341578006744, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4282512664794922, + "rewards/correct_reward_func/std": 0.14699019491672516, + "step": 486 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2418.0, + "completions/max_terminated_length": 2418.0, + "completions/mean_length": 1418.5357666015625, + "completions/mean_terminated_length": 1418.5357666015625, + "completions/min_length": 809.0, + "completions/min_terminated_length": 809.0, + "epoch": 0.7585669781931464, + "grad_norm": 0.620182454586029, + "kl": 0.0546103548258543, + "learning_rate": 1.7024999999999998e-06, + "loss": -0.0024, + "num_tokens": 63023800.0, + "reward": 1.4738986492156982, + "reward_std": 0.11807496100664139, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4858033359050751, + "rewards/correct_reward_func/std": 0.16928018629550934, + "step": 487 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2352.0, + "completions/max_terminated_length": 2352.0, + "completions/mean_length": 1467.21435546875, + "completions/mean_terminated_length": 1467.21435546875, + "completions/min_length": 749.0, + "completions/min_terminated_length": 749.0, + "epoch": 0.7601246105919003, + "grad_norm": 0.5910825729370117, + "kl": 0.0529879629611969, + "learning_rate": 1.701875e-06, + "loss": 0.0043, + "num_tokens": 63153034.0, + "reward": 1.4873260259628296, + "reward_std": 0.05371030792593956, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4873259365558624, + "rewards/correct_reward_func/std": 0.11501560360193253, + "step": 488 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2009.0, + "completions/max_terminated_length": 2009.0, + "completions/mean_length": 1416.6429443359375, + "completions/mean_terminated_length": 1416.6429443359375, + "completions/min_length": 823.0, + "completions/min_terminated_length": 823.0, + "epoch": 0.7616822429906542, + "grad_norm": 0.617638885974884, + "kl": 0.05328808352351189, + "learning_rate": 1.7012499999999998e-06, + "loss": 0.0089, + "num_tokens": 63278134.0, + "reward": 1.4918674230575562, + "reward_std": 0.05592425912618637, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49186742305755615, + "rewards/correct_reward_func/std": 0.11877977102994919, + "step": 489 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2390.0, + "completions/max_terminated_length": 2390.0, + "completions/mean_length": 1438.6309814453125, + "completions/mean_terminated_length": 1438.6309814453125, + "completions/min_length": 671.0, + "completions/min_terminated_length": 671.0, + "epoch": 0.7632398753894081, + "grad_norm": 0.5881965160369873, + "kl": 0.0523222591727972, + "learning_rate": 1.700625e-06, + "loss": -0.0096, + "num_tokens": 63405033.0, + "reward": 1.4045344591140747, + "reward_std": 0.07134377211332321, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.41643914580345154, + "rewards/correct_reward_func/std": 0.1567194163799286, + "step": 490 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2166.0, + "completions/max_terminated_length": 2166.0, + "completions/mean_length": 1401.0714111328125, + "completions/mean_terminated_length": 1401.0714111328125, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.764797507788162, + "grad_norm": 0.6001453995704651, + "kl": 0.05563800781965256, + "learning_rate": 1.6999999999999998e-06, + "loss": -0.0112, + "num_tokens": 63528669.0, + "reward": 1.4765801429748535, + "reward_std": 0.0643647164106369, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47658008337020874, + "rewards/correct_reward_func/std": 0.16985422372817993, + "step": 491 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2280.0, + "completions/mean_length": 1454.0357666015625, + "completions/mean_terminated_length": 1372.8553466796875, + "completions/min_length": 921.0, + "completions/min_terminated_length": 921.0, + "epoch": 0.7663551401869159, + "grad_norm": 0.6344577670097351, + "kl": 0.05234198831021786, + "learning_rate": 1.699375e-06, + "loss": 0.0894, + "num_tokens": 63656826.0, + "reward": 1.5414061546325684, + "reward_std": 0.08393041044473648, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5533110499382019, + "rewards/correct_reward_func/std": 0.12277739495038986, + "step": 492 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2018.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1342.84521484375, + "completions/mean_terminated_length": 1342.84521484375, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.7679127725856698, + "grad_norm": 0.6125787496566772, + "kl": 0.05459017679095268, + "learning_rate": 1.6987499999999998e-06, + "loss": -0.0266, + "num_tokens": 63775661.0, + "reward": 1.463148832321167, + "reward_std": 0.0493975505232811, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46314874291419983, + "rewards/correct_reward_func/std": 0.12720361351966858, + "step": 493 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2163.0, + "completions/max_terminated_length": 2163.0, + "completions/mean_length": 1410.2261962890625, + "completions/mean_terminated_length": 1410.2261962890625, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "epoch": 0.7694704049844237, + "grad_norm": 0.625629186630249, + "kl": 0.05353173241019249, + "learning_rate": 1.698125e-06, + "loss": 0.0163, + "num_tokens": 63900228.0, + "reward": 1.5133934020996094, + "reward_std": 0.10190200060606003, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5252981781959534, + "rewards/correct_reward_func/std": 0.13644695281982422, + "step": 494 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2018.0, + "completions/max_terminated_length": 2018.0, + "completions/mean_length": 1312.297607421875, + "completions/mean_terminated_length": 1312.297607421875, + "completions/min_length": 691.0, + "completions/min_terminated_length": 691.0, + "epoch": 0.7710280373831776, + "grad_norm": 0.6696212887763977, + "kl": 0.05529572255909443, + "learning_rate": 1.6974999999999998e-06, + "loss": -0.0171, + "num_tokens": 64016269.0, + "reward": 1.464273452758789, + "reward_std": 0.07633471488952637, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4642733931541443, + "rewards/correct_reward_func/std": 0.1434505134820938, + "step": 495 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2636.0, + "completions/max_terminated_length": 2636.0, + "completions/mean_length": 1418.7738037109375, + "completions/mean_terminated_length": 1418.7738037109375, + "completions/min_length": 822.0, + "completions/min_terminated_length": 822.0, + "epoch": 0.7725856697819314, + "grad_norm": 0.6065709590911865, + "kl": 0.05418024770915508, + "learning_rate": 1.6968749999999997e-06, + "loss": -0.03, + "num_tokens": 64141350.0, + "reward": 1.5015569925308228, + "reward_std": 0.0749620795249939, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5015567541122437, + "rewards/correct_reward_func/std": 0.1574825942516327, + "step": 496 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2235.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 1359.4761962890625, + "completions/mean_terminated_length": 1359.4761962890625, + "completions/min_length": 744.0, + "completions/min_terminated_length": 744.0, + "epoch": 0.7741433021806854, + "grad_norm": 0.6393604278564453, + "kl": 0.053761230781674385, + "learning_rate": 1.6962499999999999e-06, + "loss": -0.0042, + "num_tokens": 64261606.0, + "reward": 1.5147299766540527, + "reward_std": 0.08370784670114517, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5266348719596863, + "rewards/correct_reward_func/std": 0.12140747904777527, + "step": 497 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2030.0, + "completions/max_terminated_length": 2030.0, + "completions/mean_length": 1397.642822265625, + "completions/mean_terminated_length": 1397.642822265625, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "epoch": 0.7757009345794392, + "grad_norm": 0.5982325673103333, + "kl": 0.05507444404065609, + "learning_rate": 1.695625e-06, + "loss": 0.0236, + "num_tokens": 64385140.0, + "reward": 1.4505058526992798, + "reward_std": 0.06045551598072052, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4505058526992798, + "rewards/correct_reward_func/std": 0.16863283514976501, + "step": 498 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2320.0, + "completions/max_terminated_length": 2320.0, + "completions/mean_length": 1413.607177734375, + "completions/mean_terminated_length": 1413.607177734375, + "completions/min_length": 897.0, + "completions/min_terminated_length": 897.0, + "epoch": 0.7772585669781932, + "grad_norm": 0.6396889686584473, + "kl": 0.0517488569021225, + "learning_rate": 1.695e-06, + "loss": 0.0036, + "num_tokens": 64510087.0, + "reward": 1.5013822317123413, + "reward_std": 0.05566215515136719, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5013821721076965, + "rewards/correct_reward_func/std": 0.1984904706478119, + "step": 499 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1967.0, + "completions/max_terminated_length": 1967.0, + "completions/mean_length": 1331.75, + "completions/mean_terminated_length": 1331.75, + "completions/min_length": 665.0, + "completions/min_terminated_length": 665.0, + "epoch": 0.778816199376947, + "grad_norm": 0.6182188391685486, + "kl": 0.05334976129233837, + "learning_rate": 1.694375e-06, + "loss": 0.0177, + "num_tokens": 64627834.0, + "reward": 1.4577831029891968, + "reward_std": 0.07913817465305328, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46968796849250793, + "rewards/correct_reward_func/std": 0.1922857016324997, + "step": 500 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2392.0, + "completions/max_terminated_length": 2392.0, + "completions/mean_length": 1450.797607421875, + "completions/mean_terminated_length": 1450.797607421875, + "completions/min_length": 787.0, + "completions/min_terminated_length": 787.0, + "epoch": 0.780373831775701, + "grad_norm": 0.6251698136329651, + "kl": 0.051690295338630676, + "learning_rate": 1.69375e-06, + "loss": 0.0195, + "num_tokens": 64755809.0, + "reward": 1.4815540313720703, + "reward_std": 0.06203337013721466, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48155394196510315, + "rewards/correct_reward_func/std": 0.14881980419158936, + "step": 501 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2222.0, + "completions/max_terminated_length": 2222.0, + "completions/mean_length": 1361.0, + "completions/mean_terminated_length": 1361.0, + "completions/min_length": 455.0, + "completions/min_terminated_length": 455.0, + "epoch": 0.7819314641744548, + "grad_norm": 0.6154322624206543, + "kl": 0.05483602173626423, + "learning_rate": 1.693125e-06, + "loss": -0.0267, + "num_tokens": 64876139.0, + "reward": 1.434553861618042, + "reward_std": 0.1056680679321289, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44645848870277405, + "rewards/correct_reward_func/std": 0.15556204319000244, + "step": 502 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2066.0, + "completions/max_terminated_length": 2066.0, + "completions/mean_length": 1336.75, + "completions/mean_terminated_length": 1336.75, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "epoch": 0.7834890965732088, + "grad_norm": 0.6085004210472107, + "kl": 0.05451551079750061, + "learning_rate": 1.6924999999999999e-06, + "loss": 0.027, + "num_tokens": 64994438.0, + "reward": 1.5156744718551636, + "reward_std": 0.043764952570199966, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5156744718551636, + "rewards/correct_reward_func/std": 0.10536623001098633, + "step": 503 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2154.0, + "completions/max_terminated_length": 2154.0, + "completions/mean_length": 1349.65478515625, + "completions/mean_terminated_length": 1349.65478515625, + "completions/min_length": 831.0, + "completions/min_terminated_length": 831.0, + "epoch": 0.7850467289719626, + "grad_norm": 0.6269933581352234, + "kl": 0.05425166338682175, + "learning_rate": 1.691875e-06, + "loss": 0.005, + "num_tokens": 65113491.0, + "reward": 1.4136245250701904, + "reward_std": 0.07355698943138123, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.41362443566322327, + "rewards/correct_reward_func/std": 0.11681222915649414, + "step": 504 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6143.0, + "completions/max_terminated_length": 6143.0, + "completions/mean_length": 1480.0357666015625, + "completions/mean_terminated_length": 1480.0357666015625, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "epoch": 0.7866043613707165, + "grad_norm": 0.58085036277771, + "kl": 0.05101562291383743, + "learning_rate": 1.69125e-06, + "loss": 0.0136, + "num_tokens": 65243952.0, + "reward": 1.4428439140319824, + "reward_std": 0.10061752051115036, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.45474860072135925, + "rewards/correct_reward_func/std": 0.15178316831588745, + "step": 505 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2496.0, + "completions/mean_length": 1478.357177734375, + "completions/mean_terminated_length": 1397.4698486328125, + "completions/min_length": 876.0, + "completions/min_terminated_length": 876.0, + "epoch": 0.7881619937694704, + "grad_norm": 0.614782989025116, + "kl": 0.05181491747498512, + "learning_rate": 1.690625e-06, + "loss": 0.0927, + "num_tokens": 65374230.0, + "reward": 1.5134276151657104, + "reward_std": 0.10299229621887207, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5253323912620544, + "rewards/correct_reward_func/std": 0.13395552337169647, + "step": 506 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1895.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 1314.7381591796875, + "completions/mean_terminated_length": 1314.7381591796875, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "epoch": 0.7897196261682243, + "grad_norm": 0.6423413753509521, + "kl": 0.052083175629377365, + "learning_rate": 1.69e-06, + "loss": -0.0164, + "num_tokens": 65490740.0, + "reward": 1.4805521965026855, + "reward_std": 0.07352635264396667, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4924568831920624, + "rewards/correct_reward_func/std": 0.14243850111961365, + "step": 507 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2152.0, + "completions/mean_length": 1539.2738037109375, + "completions/mean_terminated_length": 1459.1204833984375, + "completions/min_length": 953.0, + "completions/min_terminated_length": 953.0, + "epoch": 0.7912772585669782, + "grad_norm": 0.6012693643569946, + "kl": 0.052487269043922424, + "learning_rate": 1.689375e-06, + "loss": 0.0457, + "num_tokens": 65626057.0, + "reward": 1.493817687034607, + "reward_std": 0.06720510125160217, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49381768703460693, + "rewards/correct_reward_func/std": 0.141060933470726, + "step": 508 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2385.0, + "completions/mean_length": 1402.7261962890625, + "completions/mean_terminated_length": 1320.9276123046875, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "epoch": 0.7928348909657321, + "grad_norm": 0.610801100730896, + "kl": 0.053645048290491104, + "learning_rate": 1.68875e-06, + "loss": 0.0464, + "num_tokens": 65749730.0, + "reward": 1.4257547855377197, + "reward_std": 0.12775346636772156, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.44956421852111816, + "rewards/correct_reward_func/std": 0.128119558095932, + "step": 509 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2260.0, + "completions/max_terminated_length": 2260.0, + "completions/mean_length": 1397.3095703125, + "completions/mean_terminated_length": 1397.3095703125, + "completions/min_length": 715.0, + "completions/min_terminated_length": 715.0, + "epoch": 0.794392523364486, + "grad_norm": 0.6136677265167236, + "kl": 0.05390959791839123, + "learning_rate": 1.688125e-06, + "loss": 0.01, + "num_tokens": 65873218.0, + "reward": 1.447425127029419, + "reward_std": 0.07631354033946991, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4593297839164734, + "rewards/correct_reward_func/std": 0.13860400021076202, + "step": 510 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1830.0, + "completions/max_terminated_length": 1830.0, + "completions/mean_length": 1334.452392578125, + "completions/mean_terminated_length": 1334.452392578125, + "completions/min_length": 805.0, + "completions/min_terminated_length": 805.0, + "epoch": 0.7959501557632399, + "grad_norm": 0.6296293139457703, + "kl": 0.05470990762114525, + "learning_rate": 1.6875e-06, + "loss": 0.018, + "num_tokens": 65991102.0, + "reward": 1.4921993017196655, + "reward_std": 0.09120924770832062, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5041038990020752, + "rewards/correct_reward_func/std": 0.20131434500217438, + "step": 511 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2360.0, + "completions/max_terminated_length": 2360.0, + "completions/mean_length": 1491.107177734375, + "completions/mean_terminated_length": 1491.107177734375, + "completions/min_length": 717.0, + "completions/min_terminated_length": 717.0, + "epoch": 0.7975077881619937, + "grad_norm": 0.6053344011306763, + "kl": 0.054690854623913765, + "learning_rate": 1.6868749999999998e-06, + "loss": 0.0082, + "num_tokens": 66122301.0, + "reward": 1.5253815650939941, + "reward_std": 0.0556306354701519, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5253814458847046, + "rewards/correct_reward_func/std": 0.19547365605831146, + "step": 512 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2043.0, + "completions/max_terminated_length": 2043.0, + "completions/mean_length": 1394.857177734375, + "completions/mean_terminated_length": 1394.857177734375, + "completions/min_length": 842.0, + "completions/min_terminated_length": 842.0, + "epoch": 0.7990654205607477, + "grad_norm": 0.6043696403503418, + "kl": 0.05226844176650047, + "learning_rate": 1.68625e-06, + "loss": 0.0163, + "num_tokens": 66245343.0, + "reward": 1.5012750625610352, + "reward_std": 0.07726840674877167, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5012750625610352, + "rewards/correct_reward_func/std": 0.17448803782463074, + "step": 513 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2104.0, + "completions/max_terminated_length": 2104.0, + "completions/mean_length": 1384.1785888671875, + "completions/mean_terminated_length": 1384.1785888671875, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "epoch": 0.8006230529595015, + "grad_norm": 0.5974183678627014, + "kl": 0.055330896750092506, + "learning_rate": 1.6856249999999998e-06, + "loss": -0.0029, + "num_tokens": 66367602.0, + "reward": 1.39756441116333, + "reward_std": 0.10132217407226562, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.42137381434440613, + "rewards/correct_reward_func/std": 0.16763533651828766, + "step": 514 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2356.0, + "completions/max_terminated_length": 2356.0, + "completions/mean_length": 1507.5833740234375, + "completions/mean_terminated_length": 1507.5833740234375, + "completions/min_length": 941.0, + "completions/min_terminated_length": 941.0, + "epoch": 0.8021806853582555, + "grad_norm": 0.5618709325790405, + "kl": 0.05384498089551926, + "learning_rate": 1.685e-06, + "loss": 0.0189, + "num_tokens": 66500467.0, + "reward": 1.504148006439209, + "reward_std": 0.057246141135692596, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5041479468345642, + "rewards/correct_reward_func/std": 0.1694769561290741, + "step": 515 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3970.0, + "completions/max_terminated_length": 3970.0, + "completions/mean_length": 1491.3095703125, + "completions/mean_terminated_length": 1491.3095703125, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "epoch": 0.8037383177570093, + "grad_norm": 0.6067160964012146, + "kl": 0.05185644514858723, + "learning_rate": 1.6843749999999999e-06, + "loss": 0.0155, + "num_tokens": 66631719.0, + "reward": 1.4592036008834839, + "reward_std": 0.0799928829073906, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4711082875728607, + "rewards/correct_reward_func/std": 0.12382801622152328, + "step": 516 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2280.0, + "completions/max_terminated_length": 2280.0, + "completions/mean_length": 1441.6905517578125, + "completions/mean_terminated_length": 1441.6905517578125, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "epoch": 0.8052959501557633, + "grad_norm": 0.5906907916069031, + "kl": 0.05282064713537693, + "learning_rate": 1.68375e-06, + "loss": 0.0161, + "num_tokens": 66758611.0, + "reward": 1.4480403661727905, + "reward_std": 0.07139705866575241, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4599449932575226, + "rewards/correct_reward_func/std": 0.18760444223880768, + "step": 517 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2303.0, + "completions/max_terminated_length": 2303.0, + "completions/mean_length": 1492.5357666015625, + "completions/mean_terminated_length": 1492.5357666015625, + "completions/min_length": 753.0, + "completions/min_terminated_length": 753.0, + "epoch": 0.8068535825545171, + "grad_norm": 0.5900249481201172, + "kl": 0.05465748719871044, + "learning_rate": 1.6831249999999999e-06, + "loss": 0.0236, + "num_tokens": 66889984.0, + "reward": 1.473251223564148, + "reward_std": 0.05754239857196808, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4732511639595032, + "rewards/correct_reward_func/std": 0.16302239894866943, + "step": 518 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2189.0, + "completions/max_terminated_length": 2189.0, + "completions/mean_length": 1433.2381591796875, + "completions/mean_terminated_length": 1433.2381591796875, + "completions/min_length": 794.0, + "completions/min_terminated_length": 794.0, + "epoch": 0.8084112149532711, + "grad_norm": 0.6327292323112488, + "kl": 0.05179595574736595, + "learning_rate": 1.6825e-06, + "loss": -0.0127, + "num_tokens": 67016334.0, + "reward": 1.517919898033142, + "reward_std": 0.10886523127555847, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5417292714118958, + "rewards/correct_reward_func/std": 0.18844658136367798, + "step": 519 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2372.0, + "completions/max_terminated_length": 2372.0, + "completions/mean_length": 1455.261962890625, + "completions/mean_terminated_length": 1455.261962890625, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "epoch": 0.8099688473520249, + "grad_norm": 0.5824243426322937, + "kl": 0.05402742512524128, + "learning_rate": 1.6818749999999999e-06, + "loss": 0.0003, + "num_tokens": 67144684.0, + "reward": 1.4728202819824219, + "reward_std": 0.07268865406513214, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48472505807876587, + "rewards/correct_reward_func/std": 0.1330195963382721, + "step": 520 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1930.0, + "completions/max_terminated_length": 1930.0, + "completions/mean_length": 1356.7738037109375, + "completions/mean_terminated_length": 1356.7738037109375, + "completions/min_length": 699.0, + "completions/min_terminated_length": 699.0, + "epoch": 0.8115264797507789, + "grad_norm": 0.635526716709137, + "kl": 0.052402498200535774, + "learning_rate": 1.6812499999999998e-06, + "loss": -0.0018, + "num_tokens": 67264455.0, + "reward": 1.4374048709869385, + "reward_std": 0.10275428742170334, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44930967688560486, + "rewards/correct_reward_func/std": 0.12510575354099274, + "step": 521 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2142.0, + "completions/max_terminated_length": 2142.0, + "completions/mean_length": 1410.011962890625, + "completions/mean_terminated_length": 1410.011962890625, + "completions/min_length": 785.0, + "completions/min_terminated_length": 785.0, + "epoch": 0.8130841121495327, + "grad_norm": 0.6527436971664429, + "kl": 0.061864860355854034, + "learning_rate": 1.680625e-06, + "loss": -0.007, + "num_tokens": 67388722.0, + "reward": 1.4815458059310913, + "reward_std": 0.06229028478264809, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.481545627117157, + "rewards/correct_reward_func/std": 0.18914058804512024, + "step": 522 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2120.0, + "completions/max_terminated_length": 2120.0, + "completions/mean_length": 1409.8809814453125, + "completions/mean_terminated_length": 1409.8809814453125, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.8146417445482866, + "grad_norm": 0.5648652911186218, + "kl": 0.05158809758722782, + "learning_rate": 1.6799999999999998e-06, + "loss": -0.0055, + "num_tokens": 67513224.0, + "reward": 1.4905917644500732, + "reward_std": 0.0897277295589447, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5024964213371277, + "rewards/correct_reward_func/std": 0.1823735535144806, + "step": 523 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2069.0, + "completions/max_terminated_length": 2069.0, + "completions/mean_length": 1410.6190185546875, + "completions/mean_terminated_length": 1410.6190185546875, + "completions/min_length": 615.0, + "completions/min_terminated_length": 615.0, + "epoch": 0.8161993769470405, + "grad_norm": 0.601411759853363, + "kl": 0.0548630990087986, + "learning_rate": 1.679375e-06, + "loss": 0.0227, + "num_tokens": 67637728.0, + "reward": 1.4363572597503662, + "reward_std": 0.08451084047555923, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44826188683509827, + "rewards/correct_reward_func/std": 0.17532704770565033, + "step": 524 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2041.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 1413.547607421875, + "completions/mean_terminated_length": 1413.547607421875, + "completions/min_length": 863.0, + "completions/min_terminated_length": 863.0, + "epoch": 0.8177570093457944, + "grad_norm": 0.619880735874176, + "kl": 0.0535897146910429, + "learning_rate": 1.6787499999999998e-06, + "loss": 0.0125, + "num_tokens": 67762508.0, + "reward": 1.4487403631210327, + "reward_std": 0.0894000232219696, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46064507961273193, + "rewards/correct_reward_func/std": 0.14411649107933044, + "step": 525 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2225.0, + "completions/max_terminated_length": 2225.0, + "completions/mean_length": 1396.892822265625, + "completions/mean_terminated_length": 1396.892822265625, + "completions/min_length": 208.0, + "completions/min_terminated_length": 208.0, + "epoch": 0.8193146417445483, + "grad_norm": 0.5552582144737244, + "kl": 0.052618470042943954, + "learning_rate": 1.678125e-06, + "loss": 0.0085, + "num_tokens": 67885985.0, + "reward": 1.513651967048645, + "reward_std": 0.07783416658639908, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5255565643310547, + "rewards/correct_reward_func/std": 0.18098370730876923, + "step": 526 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2145.0, + "completions/max_terminated_length": 2145.0, + "completions/mean_length": 1407.4285888671875, + "completions/mean_terminated_length": 1407.4285888671875, + "completions/min_length": 803.0, + "completions/min_terminated_length": 803.0, + "epoch": 0.8208722741433022, + "grad_norm": 0.5891075730323792, + "kl": 0.05191943235695362, + "learning_rate": 1.6774999999999998e-06, + "loss": -0.0255, + "num_tokens": 68010101.0, + "reward": 1.531822919845581, + "reward_std": 0.0646144449710846, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5318229794502258, + "rewards/correct_reward_func/std": 0.13647998869419098, + "step": 527 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2089.0, + "completions/max_terminated_length": 2089.0, + "completions/mean_length": 1330.71435546875, + "completions/mean_terminated_length": 1330.71435546875, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "epoch": 0.822429906542056, + "grad_norm": 0.6207326054573059, + "kl": 0.05244195647537708, + "learning_rate": 1.6768749999999997e-06, + "loss": -0.0223, + "num_tokens": 68127731.0, + "reward": 1.5092251300811768, + "reward_std": 0.05685145780444145, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5092251300811768, + "rewards/correct_reward_func/std": 0.1656326800584793, + "step": 528 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2121.0, + "completions/max_terminated_length": 2121.0, + "completions/mean_length": 1391.7381591796875, + "completions/mean_terminated_length": 1391.7381591796875, + "completions/min_length": 651.0, + "completions/min_terminated_length": 651.0, + "epoch": 0.82398753894081, + "grad_norm": 0.5846036076545715, + "kl": 0.05157465487718582, + "learning_rate": 1.67625e-06, + "loss": 0.0272, + "num_tokens": 68250595.0, + "reward": 1.480483889579773, + "reward_std": 0.06157321855425835, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4804837703704834, + "rewards/correct_reward_func/std": 0.12125560641288757, + "step": 529 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2016.0, + "completions/mean_length": 1558.21435546875, + "completions/mean_terminated_length": 1478.2890625, + "completions/min_length": 859.0, + "completions/min_terminated_length": 859.0, + "epoch": 0.8255451713395638, + "grad_norm": 0.5714659094810486, + "kl": 0.04993342235684395, + "learning_rate": 1.675625e-06, + "loss": 0.0461, + "num_tokens": 68387617.0, + "reward": 1.4617350101470947, + "reward_std": 0.062348198145627975, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4617350101470947, + "rewards/correct_reward_func/std": 0.1364985555410385, + "step": 530 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2211.0, + "completions/max_terminated_length": 2211.0, + "completions/mean_length": 1433.452392578125, + "completions/mean_terminated_length": 1433.452392578125, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "epoch": 0.8271028037383178, + "grad_norm": 0.5750571489334106, + "kl": 0.05183848738670349, + "learning_rate": 1.675e-06, + "loss": -0.012, + "num_tokens": 68513937.0, + "reward": 1.5087705850601196, + "reward_std": 0.07611233741044998, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5087705254554749, + "rewards/correct_reward_func/std": 0.17911018431186676, + "step": 531 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2028.0, + "completions/max_terminated_length": 2028.0, + "completions/mean_length": 1391.5357666015625, + "completions/mean_terminated_length": 1391.5357666015625, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.8286604361370716, + "grad_norm": 0.6533128023147583, + "kl": 0.05436134710907936, + "learning_rate": 1.674375e-06, + "loss": 0.0264, + "num_tokens": 68636850.0, + "reward": 1.51654851436615, + "reward_std": 0.09789982438087463, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5284532904624939, + "rewards/correct_reward_func/std": 0.15198057889938354, + "step": 532 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2412.0, + "completions/max_terminated_length": 2412.0, + "completions/mean_length": 1433.857177734375, + "completions/mean_terminated_length": 1433.857177734375, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "epoch": 0.8302180685358256, + "grad_norm": 0.5955455303192139, + "kl": 0.055306799709796906, + "learning_rate": 1.67375e-06, + "loss": -0.0119, + "num_tokens": 68763192.0, + "reward": 1.379233717918396, + "reward_std": 0.04899342358112335, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.37923356890678406, + "rewards/correct_reward_func/std": 0.1353388875722885, + "step": 533 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2564.0, + "completions/max_terminated_length": 2564.0, + "completions/mean_length": 1472.8095703125, + "completions/mean_terminated_length": 1472.8095703125, + "completions/min_length": 511.0, + "completions/min_terminated_length": 511.0, + "epoch": 0.8317757009345794, + "grad_norm": 0.5779551267623901, + "kl": 0.05130494572222233, + "learning_rate": 1.673125e-06, + "loss": 0.0082, + "num_tokens": 68893040.0, + "reward": 1.4051659107208252, + "reward_std": 0.049869608134031296, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4051658511161804, + "rewards/correct_reward_func/std": 0.14183004200458527, + "step": 534 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1413.2857666015625, + "completions/mean_terminated_length": 1413.2857666015625, + "completions/min_length": 855.0, + "completions/min_terminated_length": 855.0, + "epoch": 0.8333333333333334, + "grad_norm": 0.6209075450897217, + "kl": 0.05292078107595444, + "learning_rate": 1.6725e-06, + "loss": -0.0059, + "num_tokens": 69017690.0, + "reward": 1.5440659523010254, + "reward_std": 0.06486238539218903, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5440659523010254, + "rewards/correct_reward_func/std": 0.1593974381685257, + "step": 535 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2176.0, + "completions/max_terminated_length": 2176.0, + "completions/mean_length": 1482.3809814453125, + "completions/mean_terminated_length": 1482.3809814453125, + "completions/min_length": 914.0, + "completions/min_terminated_length": 914.0, + "epoch": 0.8348909657320872, + "grad_norm": 0.6003454923629761, + "kl": 0.05392787978053093, + "learning_rate": 1.671875e-06, + "loss": 0.0123, + "num_tokens": 69148084.0, + "reward": 1.436340093612671, + "reward_std": 0.07149424403905869, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4363400638103485, + "rewards/correct_reward_func/std": 0.14438651502132416, + "step": 536 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2110.0, + "completions/max_terminated_length": 2110.0, + "completions/mean_length": 1409.46435546875, + "completions/mean_terminated_length": 1409.46435546875, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 0.8364485981308412, + "grad_norm": 0.6742749214172363, + "kl": 0.053148942068219185, + "learning_rate": 1.6712499999999999e-06, + "loss": -0.0079, + "num_tokens": 69272203.0, + "reward": 1.4142444133758545, + "reward_std": 0.10058359056711197, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.43805375695228577, + "rewards/correct_reward_func/std": 0.17819122970104218, + "step": 537 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2199.0, + "completions/mean_length": 1534.0833740234375, + "completions/mean_terminated_length": 1453.867431640625, + "completions/min_length": 861.0, + "completions/min_terminated_length": 861.0, + "epoch": 0.838006230529595, + "grad_norm": 0.572339653968811, + "kl": 0.051896609365940094, + "learning_rate": 1.670625e-06, + "loss": 0.0649, + "num_tokens": 69406928.0, + "reward": 1.4281702041625977, + "reward_std": 0.08568203449249268, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44007474184036255, + "rewards/correct_reward_func/std": 0.1391279697418213, + "step": 538 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3857.0, + "completions/max_terminated_length": 3857.0, + "completions/mean_length": 1528.0, + "completions/mean_terminated_length": 1528.0, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "epoch": 0.839563862928349, + "grad_norm": 0.558670699596405, + "kl": 0.05065500736236572, + "learning_rate": 1.6699999999999999e-06, + "loss": 0.0049, + "num_tokens": 69541424.0, + "reward": 1.416330337524414, + "reward_std": 0.12241604179143906, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4401398301124573, + "rewards/correct_reward_func/std": 0.12226840853691101, + "step": 539 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2426.0, + "completions/max_terminated_length": 2426.0, + "completions/mean_length": 1439.107177734375, + "completions/mean_terminated_length": 1439.107177734375, + "completions/min_length": 756.0, + "completions/min_terminated_length": 756.0, + "epoch": 0.8411214953271028, + "grad_norm": 0.6253755688667297, + "kl": 0.05376381799578667, + "learning_rate": 1.669375e-06, + "loss": 0.0018, + "num_tokens": 69668123.0, + "reward": 1.4674383401870728, + "reward_std": 0.0582718625664711, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4674382507801056, + "rewards/correct_reward_func/std": 0.17001482844352722, + "step": 540 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2122.0, + "completions/max_terminated_length": 2122.0, + "completions/mean_length": 1451.797607421875, + "completions/mean_terminated_length": 1451.797607421875, + "completions/min_length": 980.0, + "completions/min_terminated_length": 980.0, + "epoch": 0.8426791277258567, + "grad_norm": 0.5441909432411194, + "kl": 0.05154600366950035, + "learning_rate": 1.66875e-06, + "loss": -0.0146, + "num_tokens": 69795798.0, + "reward": 1.5086621046066284, + "reward_std": 0.045781608670949936, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5086619853973389, + "rewards/correct_reward_func/std": 0.15602315962314606, + "step": 541 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2118.0, + "completions/max_terminated_length": 2118.0, + "completions/mean_length": 1410.3809814453125, + "completions/mean_terminated_length": 1410.3809814453125, + "completions/min_length": 894.0, + "completions/min_terminated_length": 894.0, + "epoch": 0.8442367601246106, + "grad_norm": 0.5914663076400757, + "kl": 0.053898200392723083, + "learning_rate": 1.668125e-06, + "loss": -0.02, + "num_tokens": 69920132.0, + "reward": 1.4829809665679932, + "reward_std": 0.05561475455760956, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4829808175563812, + "rewards/correct_reward_func/std": 0.14528672397136688, + "step": 542 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2454.0, + "completions/max_terminated_length": 2454.0, + "completions/mean_length": 1559.607177734375, + "completions/mean_terminated_length": 1559.607177734375, + "completions/min_length": 901.0, + "completions/min_terminated_length": 901.0, + "epoch": 0.8457943925233645, + "grad_norm": 0.5803366303443909, + "kl": 0.05089765228331089, + "learning_rate": 1.6675e-06, + "loss": 0.024, + "num_tokens": 70057187.0, + "reward": 1.5257433652877808, + "reward_std": 0.07609397917985916, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.525743305683136, + "rewards/correct_reward_func/std": 0.15661990642547607, + "step": 543 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2234.0, + "completions/max_terminated_length": 2234.0, + "completions/mean_length": 1478.202392578125, + "completions/mean_terminated_length": 1478.202392578125, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.8473520249221184, + "grad_norm": 0.631567120552063, + "kl": 0.051157766953110695, + "learning_rate": 1.666875e-06, + "loss": -0.0001, + "num_tokens": 70187470.0, + "reward": 1.4295824766159058, + "reward_std": 0.06356283277273178, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.42958250641822815, + "rewards/correct_reward_func/std": 0.164540097117424, + "step": 544 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2417.0, + "completions/max_terminated_length": 2417.0, + "completions/mean_length": 1515.0238037109375, + "completions/mean_terminated_length": 1515.0238037109375, + "completions/min_length": 935.0, + "completions/min_terminated_length": 935.0, + "epoch": 0.8489096573208723, + "grad_norm": 0.5900546908378601, + "kl": 0.05033543519675732, + "learning_rate": 1.66625e-06, + "loss": -0.0113, + "num_tokens": 70320696.0, + "reward": 1.4821819067001343, + "reward_std": 0.0709303766489029, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4940865933895111, + "rewards/correct_reward_func/std": 0.1473175436258316, + "step": 545 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2307.0, + "completions/max_terminated_length": 2307.0, + "completions/mean_length": 1536.761962890625, + "completions/mean_terminated_length": 1536.761962890625, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "epoch": 0.8504672897196262, + "grad_norm": 0.5679816007614136, + "kl": 0.05010136775672436, + "learning_rate": 1.6656249999999998e-06, + "loss": -0.0173, + "num_tokens": 70455964.0, + "reward": 1.5630290508270264, + "reward_std": 0.06558456271886826, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5630288124084473, + "rewards/correct_reward_func/std": 0.16733138263225555, + "step": 546 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2270.0, + "completions/max_terminated_length": 2270.0, + "completions/mean_length": 1501.6429443359375, + "completions/mean_terminated_length": 1501.6429443359375, + "completions/min_length": 979.0, + "completions/min_terminated_length": 979.0, + "epoch": 0.8520249221183801, + "grad_norm": 0.5807083249092102, + "kl": 0.05040537752211094, + "learning_rate": 1.665e-06, + "loss": 0.0092, + "num_tokens": 70588168.0, + "reward": 1.4890462160110474, + "reward_std": 0.050529684871435165, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.489046186208725, + "rewards/correct_reward_func/std": 0.11799110472202301, + "step": 547 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2537.0, + "completions/max_terminated_length": 2537.0, + "completions/mean_length": 1504.90478515625, + "completions/mean_terminated_length": 1504.90478515625, + "completions/min_length": 844.0, + "completions/min_terminated_length": 844.0, + "epoch": 0.8535825545171339, + "grad_norm": 0.5985187292098999, + "kl": 0.0527173038572073, + "learning_rate": 1.6643749999999998e-06, + "loss": 0.0376, + "num_tokens": 70720526.0, + "reward": 1.4671262502670288, + "reward_std": 0.09527470171451569, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4790307879447937, + "rewards/correct_reward_func/std": 0.2005491405725479, + "step": 548 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2135.0, + "completions/max_terminated_length": 2135.0, + "completions/mean_length": 1473.0, + "completions/mean_terminated_length": 1473.0, + "completions/min_length": 635.0, + "completions/min_terminated_length": 635.0, + "epoch": 0.8551401869158879, + "grad_norm": 0.6216426491737366, + "kl": 0.0518038310110569, + "learning_rate": 1.66375e-06, + "loss": 0.002, + "num_tokens": 70850270.0, + "reward": 1.4148383140563965, + "reward_std": 0.13262715935707092, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4386478662490845, + "rewards/correct_reward_func/std": 0.1450170874595642, + "step": 549 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2297.0, + "completions/mean_length": 1617.7738037109375, + "completions/mean_terminated_length": 1538.566162109375, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "epoch": 0.8566978193146417, + "grad_norm": 0.5692617893218994, + "kl": 0.04915030300617218, + "learning_rate": 1.6631249999999999e-06, + "loss": 0.0763, + "num_tokens": 70992001.0, + "reward": 1.4859832525253296, + "reward_std": 0.08638235181570053, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.49788784980773926, + "rewards/correct_reward_func/std": 0.1480027735233307, + "step": 550 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2063.0, + "completions/mean_length": 1563.7261962890625, + "completions/mean_terminated_length": 1483.867431640625, + "completions/min_length": 1052.0, + "completions/min_terminated_length": 1052.0, + "epoch": 0.8582554517133957, + "grad_norm": 0.5355828404426575, + "kl": 0.04826325178146362, + "learning_rate": 1.6625e-06, + "loss": 0.0394, + "num_tokens": 71129288.0, + "reward": 1.4876452684402466, + "reward_std": 0.05155961960554123, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4876452386379242, + "rewards/correct_reward_func/std": 0.17587290704250336, + "step": 551 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2185.0, + "completions/max_terminated_length": 2185.0, + "completions/mean_length": 1465.4761962890625, + "completions/mean_terminated_length": 1465.4761962890625, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "epoch": 0.8598130841121495, + "grad_norm": 0.6027195453643799, + "kl": 0.051084551960229874, + "learning_rate": 1.6618749999999999e-06, + "loss": -0.0266, + "num_tokens": 71258208.0, + "reward": 1.48594069480896, + "reward_std": 0.06709278374910355, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4859406650066376, + "rewards/correct_reward_func/std": 0.16938088834285736, + "step": 552 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4262.0, + "completions/max_terminated_length": 4262.0, + "completions/mean_length": 1517.666748046875, + "completions/mean_terminated_length": 1517.666748046875, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 0.8613707165109035, + "grad_norm": 0.580878734588623, + "kl": 0.04972629249095917, + "learning_rate": 1.6612499999999998e-06, + "loss": -0.0144, + "num_tokens": 71391566.0, + "reward": 1.4727210998535156, + "reward_std": 0.06501049548387527, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4727211892604828, + "rewards/correct_reward_func/std": 0.17857278883457184, + "step": 553 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1985.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 1515.7857666015625, + "completions/mean_terminated_length": 1515.7857666015625, + "completions/min_length": 840.0, + "completions/min_terminated_length": 840.0, + "epoch": 0.8629283489096573, + "grad_norm": 0.599119246006012, + "kl": 0.050645509734749794, + "learning_rate": 1.6606249999999999e-06, + "loss": -0.0003, + "num_tokens": 71524916.0, + "reward": 1.4626071453094482, + "reward_std": 0.053476471453905106, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46260714530944824, + "rewards/correct_reward_func/std": 0.16813401877880096, + "step": 554 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5913.0, + "completions/max_terminated_length": 5913.0, + "completions/mean_length": 1586.047607421875, + "completions/mean_terminated_length": 1586.047607421875, + "completions/min_length": 1029.0, + "completions/min_terminated_length": 1029.0, + "epoch": 0.8644859813084113, + "grad_norm": 0.547502338886261, + "kl": 0.04839299060404301, + "learning_rate": 1.6599999999999998e-06, + "loss": 0.0048, + "num_tokens": 71664126.0, + "reward": 1.568305253982544, + "reward_std": 0.08509069681167603, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5802100300788879, + "rewards/correct_reward_func/std": 0.1785019338130951, + "step": 555 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2329.0, + "completions/mean_length": 1738.5238037109375, + "completions/mean_terminated_length": 1581.1219482421875, + "completions/min_length": 759.0, + "completions/min_terminated_length": 759.0, + "epoch": 0.8660436137071651, + "grad_norm": 0.5003688931465149, + "kl": 0.047175006940960884, + "learning_rate": 1.6593749999999999e-06, + "loss": 0.072, + "num_tokens": 71816114.0, + "reward": 1.487537145614624, + "reward_std": 0.078678660094738, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4875370264053345, + "rewards/correct_reward_func/std": 0.17857803404331207, + "step": 556 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2373.0, + "completions/max_terminated_length": 2373.0, + "completions/mean_length": 1579.0595703125, + "completions/mean_terminated_length": 1579.0595703125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 0.867601246105919, + "grad_norm": 0.583847165107727, + "kl": 0.04886885918676853, + "learning_rate": 1.6587499999999998e-06, + "loss": 0.0038, + "num_tokens": 71954809.0, + "reward": 1.4350553750991821, + "reward_std": 0.06607770174741745, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43505528569221497, + "rewards/correct_reward_func/std": 0.10001393407583237, + "step": 557 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2380.0, + "completions/max_terminated_length": 2380.0, + "completions/mean_length": 1577.0, + "completions/mean_terminated_length": 1577.0, + "completions/min_length": 924.0, + "completions/min_terminated_length": 924.0, + "epoch": 0.8691588785046729, + "grad_norm": 0.5692570805549622, + "kl": 0.05144515633583069, + "learning_rate": 1.658125e-06, + "loss": -0.0243, + "num_tokens": 72093283.0, + "reward": 1.4471040964126587, + "reward_std": 0.06054630130529404, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44710394740104675, + "rewards/correct_reward_func/std": 0.1185460090637207, + "step": 558 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2167.0, + "completions/mean_length": 1629.9285888671875, + "completions/mean_terminated_length": 1550.867431640625, + "completions/min_length": 830.0, + "completions/min_terminated_length": 830.0, + "epoch": 0.8707165109034268, + "grad_norm": 0.5377056002616882, + "kl": 0.049677252769470215, + "learning_rate": 1.6574999999999998e-06, + "loss": 0.0604, + "num_tokens": 72236281.0, + "reward": 1.4650782346725464, + "reward_std": 0.08549048751592636, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4650781750679016, + "rewards/correct_reward_func/std": 0.13779646158218384, + "step": 559 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2994.0, + "completions/max_terminated_length": 2994.0, + "completions/mean_length": 1595.09521484375, + "completions/mean_terminated_length": 1595.09521484375, + "completions/min_length": 921.0, + "completions/min_terminated_length": 921.0, + "epoch": 0.8722741433021807, + "grad_norm": 0.5444644689559937, + "kl": 0.053564492613077164, + "learning_rate": 1.6568750000000001e-06, + "loss": -0.0056, + "num_tokens": 72376257.0, + "reward": 1.464841604232788, + "reward_std": 0.07330876588821411, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4648415148258209, + "rewards/correct_reward_func/std": 0.13325951993465424, + "step": 560 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2346.0, + "completions/max_terminated_length": 2346.0, + "completions/mean_length": 1595.761962890625, + "completions/mean_terminated_length": 1595.761962890625, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "epoch": 0.8738317757009346, + "grad_norm": 0.5756959915161133, + "kl": 0.0488431490957737, + "learning_rate": 1.65625e-06, + "loss": -0.0016, + "num_tokens": 72516427.0, + "reward": 1.5044291019439697, + "reward_std": 0.05856965854763985, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5044289827346802, + "rewards/correct_reward_func/std": 0.16118833422660828, + "step": 561 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2420.0, + "completions/max_terminated_length": 2420.0, + "completions/mean_length": 1523.202392578125, + "completions/mean_terminated_length": 1523.202392578125, + "completions/min_length": 905.0, + "completions/min_terminated_length": 905.0, + "epoch": 0.8753894080996885, + "grad_norm": 0.6184421181678772, + "kl": 0.051321882754564285, + "learning_rate": 1.655625e-06, + "loss": 0.0021, + "num_tokens": 72650334.0, + "reward": 1.439370036125183, + "reward_std": 0.12014901638031006, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4512746334075928, + "rewards/correct_reward_func/std": 0.13923780620098114, + "step": 562 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2735.0, + "completions/max_terminated_length": 2735.0, + "completions/mean_length": 1580.0595703125, + "completions/mean_terminated_length": 1580.0595703125, + "completions/min_length": 1087.0, + "completions/min_terminated_length": 1087.0, + "epoch": 0.8769470404984424, + "grad_norm": 0.5475049018859863, + "kl": 0.05060616135597229, + "learning_rate": 1.655e-06, + "loss": 0.0094, + "num_tokens": 72789155.0, + "reward": 1.4986872673034668, + "reward_std": 0.041443560272455215, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4986870288848877, + "rewards/correct_reward_func/std": 0.1334153264760971, + "step": 563 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2607.0, + "completions/mean_length": 1658.9881591796875, + "completions/mean_terminated_length": 1580.277099609375, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 0.8785046728971962, + "grad_norm": 0.5355792045593262, + "kl": 0.0508806686848402, + "learning_rate": 1.654375e-06, + "loss": -0.0215, + "num_tokens": 72934552.0, + "reward": 1.425096035003662, + "reward_std": 0.0804082602262497, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4370007812976837, + "rewards/correct_reward_func/std": 0.12526416778564453, + "step": 564 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2914.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 1626.2857666015625, + "completions/mean_terminated_length": 1626.2857666015625, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "epoch": 0.8800623052959502, + "grad_norm": 0.5639548897743225, + "kl": 0.05000521242618561, + "learning_rate": 1.65375e-06, + "loss": 0.0054, + "num_tokens": 73077196.0, + "reward": 1.4919753074645996, + "reward_std": 0.06127806007862091, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49197524785995483, + "rewards/correct_reward_func/std": 0.1505451202392578, + "step": 565 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2300.0, + "completions/max_terminated_length": 2300.0, + "completions/mean_length": 1541.8690185546875, + "completions/mean_terminated_length": 1541.8690185546875, + "completions/min_length": 839.0, + "completions/min_terminated_length": 839.0, + "epoch": 0.881619937694704, + "grad_norm": 0.5418336987495422, + "kl": 0.04926094599068165, + "learning_rate": 1.653125e-06, + "loss": -0.0078, + "num_tokens": 73212797.0, + "reward": 1.5520154237747192, + "reward_std": 0.06705118715763092, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5520154237747192, + "rewards/correct_reward_func/std": 0.22710636258125305, + "step": 566 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2350.0, + "completions/max_terminated_length": 2350.0, + "completions/mean_length": 1580.0357666015625, + "completions/mean_terminated_length": 1580.0357666015625, + "completions/min_length": 948.0, + "completions/min_terminated_length": 948.0, + "epoch": 0.883177570093458, + "grad_norm": 0.5915489196777344, + "kl": 0.05472877249121666, + "learning_rate": 1.6525e-06, + "loss": -0.0068, + "num_tokens": 73351538.0, + "reward": 1.450407862663269, + "reward_std": 0.10079541802406311, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.47421735525131226, + "rewards/correct_reward_func/std": 0.12341609597206116, + "step": 567 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2387.0, + "completions/max_terminated_length": 2387.0, + "completions/mean_length": 1513.6905517578125, + "completions/mean_terminated_length": 1513.6905517578125, + "completions/min_length": 1000.0, + "completions/min_terminated_length": 1000.0, + "epoch": 0.8847352024922118, + "grad_norm": 0.600501537322998, + "kl": 0.05078642629086971, + "learning_rate": 1.651875e-06, + "loss": -0.0143, + "num_tokens": 73484532.0, + "reward": 1.485834002494812, + "reward_std": 0.07049893587827682, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4858340322971344, + "rewards/correct_reward_func/std": 0.16757294535636902, + "step": 568 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3088.0, + "completions/max_terminated_length": 3088.0, + "completions/mean_length": 1561.09521484375, + "completions/mean_terminated_length": 1561.09521484375, + "completions/min_length": 974.0, + "completions/min_terminated_length": 974.0, + "epoch": 0.8862928348909658, + "grad_norm": 0.641735851764679, + "kl": 0.052700335159897804, + "learning_rate": 1.65125e-06, + "loss": -0.0118, + "num_tokens": 73621460.0, + "reward": 1.4019627571105957, + "reward_std": 0.089509978890419, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4138675034046173, + "rewards/correct_reward_func/std": 0.20820264518260956, + "step": 569 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2686.0, + "completions/max_terminated_length": 2686.0, + "completions/mean_length": 1680.702392578125, + "completions/mean_terminated_length": 1680.702392578125, + "completions/min_length": 1087.0, + "completions/min_terminated_length": 1087.0, + "epoch": 0.8878504672897196, + "grad_norm": 0.5752266049385071, + "kl": 0.05081222578883171, + "learning_rate": 1.650625e-06, + "loss": 0.001, + "num_tokens": 73768867.0, + "reward": 1.428143858909607, + "reward_std": 0.08753962814807892, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.440048485994339, + "rewards/correct_reward_func/std": 0.14757801592350006, + "step": 570 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2271.0, + "completions/max_terminated_length": 2271.0, + "completions/mean_length": 1525.011962890625, + "completions/mean_terminated_length": 1525.011962890625, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "epoch": 0.8894080996884736, + "grad_norm": 0.5790801644325256, + "kl": 0.050931330770254135, + "learning_rate": 1.6499999999999999e-06, + "loss": -0.0014, + "num_tokens": 73902932.0, + "reward": 1.5280438661575317, + "reward_std": 0.07709922641515732, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5399484634399414, + "rewards/correct_reward_func/std": 0.19332382082939148, + "step": 571 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2723.0, + "completions/max_terminated_length": 2723.0, + "completions/mean_length": 1548.2857666015625, + "completions/mean_terminated_length": 1548.2857666015625, + "completions/min_length": 814.0, + "completions/min_terminated_length": 814.0, + "epoch": 0.8909657320872274, + "grad_norm": 0.5646366477012634, + "kl": 0.05197379179298878, + "learning_rate": 1.649375e-06, + "loss": 0.0077, + "num_tokens": 74038952.0, + "reward": 1.399301290512085, + "reward_std": 0.0882752537727356, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4112059772014618, + "rewards/correct_reward_func/std": 0.13882336020469666, + "step": 572 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2738.0, + "completions/max_terminated_length": 2738.0, + "completions/mean_length": 1620.15478515625, + "completions/mean_terminated_length": 1620.15478515625, + "completions/min_length": 1027.0, + "completions/min_terminated_length": 1027.0, + "epoch": 0.8925233644859814, + "grad_norm": 0.5650038719177246, + "kl": 0.0501062236726284, + "learning_rate": 1.6487499999999999e-06, + "loss": 0.0136, + "num_tokens": 74180907.0, + "reward": 1.4877279996871948, + "reward_std": 0.06202785298228264, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48772794008255005, + "rewards/correct_reward_func/std": 0.1389545202255249, + "step": 573 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2306.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 1473.297607421875, + "completions/mean_terminated_length": 1473.297607421875, + "completions/min_length": 799.0, + "completions/min_terminated_length": 799.0, + "epoch": 0.8940809968847352, + "grad_norm": 0.5704150199890137, + "kl": 0.05233505181968212, + "learning_rate": 1.648125e-06, + "loss": -0.0144, + "num_tokens": 74310478.0, + "reward": 1.524600625038147, + "reward_std": 0.10687962174415588, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.536505401134491, + "rewards/correct_reward_func/std": 0.17535418272018433, + "step": 574 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2972.0, + "completions/max_terminated_length": 2972.0, + "completions/mean_length": 1529.8095703125, + "completions/mean_terminated_length": 1529.8095703125, + "completions/min_length": 751.0, + "completions/min_terminated_length": 751.0, + "epoch": 0.8956386292834891, + "grad_norm": 0.583591878414154, + "kl": 0.05231664888560772, + "learning_rate": 1.6475e-06, + "loss": -0.0175, + "num_tokens": 74444784.0, + "reward": 1.4119700193405151, + "reward_std": 0.08519253879785538, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.42387470602989197, + "rewards/correct_reward_func/std": 0.1410830169916153, + "step": 575 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3040.0, + "completions/mean_length": 1685.96435546875, + "completions/mean_terminated_length": 1607.5782470703125, + "completions/min_length": 1009.0, + "completions/min_terminated_length": 1009.0, + "epoch": 0.897196261682243, + "grad_norm": 0.5724698901176453, + "kl": 0.05051821656525135, + "learning_rate": 1.646875e-06, + "loss": 0.0665, + "num_tokens": 74592453.0, + "reward": 1.543835997581482, + "reward_std": 0.059640318155288696, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5438359379768372, + "rewards/correct_reward_func/std": 0.19233205914497375, + "step": 576 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2392.0, + "completions/max_terminated_length": 2392.0, + "completions/mean_length": 1596.0, + "completions/mean_terminated_length": 1596.0, + "completions/min_length": 1028.0, + "completions/min_terminated_length": 1028.0, + "epoch": 0.8987538940809969, + "grad_norm": 0.5430770516395569, + "kl": 0.05068780109286308, + "learning_rate": 1.64625e-06, + "loss": 0.003, + "num_tokens": 74732313.0, + "reward": 1.4892923831939697, + "reward_std": 0.045956652611494064, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4892924129962921, + "rewards/correct_reward_func/std": 0.13337074220180511, + "step": 577 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2648.0, + "completions/max_terminated_length": 2648.0, + "completions/mean_length": 1619.3333740234375, + "completions/mean_terminated_length": 1619.3333740234375, + "completions/min_length": 1108.0, + "completions/min_terminated_length": 1108.0, + "epoch": 0.9003115264797508, + "grad_norm": 0.5669228434562683, + "kl": 0.05153697915375233, + "learning_rate": 1.6456249999999998e-06, + "loss": -0.0006, + "num_tokens": 74874247.0, + "reward": 1.4417579174041748, + "reward_std": 0.08493451774120331, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.453662633895874, + "rewards/correct_reward_func/std": 0.14506934583187103, + "step": 578 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2490.0, + "completions/max_terminated_length": 2490.0, + "completions/mean_length": 1587.84521484375, + "completions/mean_terminated_length": 1587.84521484375, + "completions/min_length": 998.0, + "completions/min_terminated_length": 998.0, + "epoch": 0.9018691588785047, + "grad_norm": 0.5580478310585022, + "kl": 0.0523674376308918, + "learning_rate": 1.645e-06, + "loss": 0.018, + "num_tokens": 75013500.0, + "reward": 1.4614784717559814, + "reward_std": 0.06435896456241608, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46147841215133667, + "rewards/correct_reward_func/std": 0.11859949678182602, + "step": 579 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3307.0, + "completions/max_terminated_length": 3307.0, + "completions/mean_length": 1637.8214111328125, + "completions/mean_terminated_length": 1637.8214111328125, + "completions/min_length": 1092.0, + "completions/min_terminated_length": 1092.0, + "epoch": 0.9034267912772586, + "grad_norm": 0.540749192237854, + "kl": 0.05217336490750313, + "learning_rate": 1.6443749999999998e-06, + "loss": 0.005, + "num_tokens": 75157203.0, + "reward": 1.4611413478851318, + "reward_std": 0.10950693488121033, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4849506914615631, + "rewards/correct_reward_func/std": 0.16065186262130737, + "step": 580 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2337.0, + "completions/max_terminated_length": 2337.0, + "completions/mean_length": 1549.2261962890625, + "completions/mean_terminated_length": 1549.2261962890625, + "completions/min_length": 938.0, + "completions/min_terminated_length": 938.0, + "epoch": 0.9049844236760125, + "grad_norm": 0.6054486632347107, + "kl": 0.05162344500422478, + "learning_rate": 1.64375e-06, + "loss": 0.0224, + "num_tokens": 75293266.0, + "reward": 1.4400663375854492, + "reward_std": 0.048954516649246216, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4400663375854492, + "rewards/correct_reward_func/std": 0.1594221442937851, + "step": 581 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2236.0, + "completions/max_terminated_length": 2236.0, + "completions/mean_length": 1517.6785888671875, + "completions/mean_terminated_length": 1517.6785888671875, + "completions/min_length": 896.0, + "completions/min_terminated_length": 896.0, + "epoch": 0.9065420560747663, + "grad_norm": 0.5864580273628235, + "kl": 0.052071839570999146, + "learning_rate": 1.6431249999999998e-06, + "loss": 0.0076, + "num_tokens": 75426745.0, + "reward": 1.4933863878250122, + "reward_std": 0.054050467908382416, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4933864176273346, + "rewards/correct_reward_func/std": 0.17375320196151733, + "step": 582 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2700.0, + "completions/max_terminated_length": 2700.0, + "completions/mean_length": 1574.7857666015625, + "completions/mean_terminated_length": 1574.7857666015625, + "completions/min_length": 740.0, + "completions/min_terminated_length": 740.0, + "epoch": 0.9080996884735203, + "grad_norm": 0.5605589747428894, + "kl": 0.05352449230849743, + "learning_rate": 1.6425e-06, + "loss": -0.0052, + "num_tokens": 75565075.0, + "reward": 1.488409399986267, + "reward_std": 0.11526400595903397, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5122188329696655, + "rewards/correct_reward_func/std": 0.14258237183094025, + "step": 583 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2662.0, + "completions/max_terminated_length": 2662.0, + "completions/mean_length": 1574.011962890625, + "completions/mean_terminated_length": 1574.011962890625, + "completions/min_length": 889.0, + "completions/min_terminated_length": 889.0, + "epoch": 0.9096573208722741, + "grad_norm": 0.5379892587661743, + "kl": 0.05358175188302994, + "learning_rate": 1.6418749999999998e-06, + "loss": -0.0107, + "num_tokens": 75703280.0, + "reward": 1.4819163084030151, + "reward_std": 0.11179199814796448, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5057256817817688, + "rewards/correct_reward_func/std": 0.16328661143779755, + "step": 584 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2656.0, + "completions/max_terminated_length": 2656.0, + "completions/mean_length": 1526.857177734375, + "completions/mean_terminated_length": 1526.857177734375, + "completions/min_length": 1027.0, + "completions/min_terminated_length": 1027.0, + "epoch": 0.9112149532710281, + "grad_norm": 0.5592092871665955, + "kl": 0.052205150946974754, + "learning_rate": 1.64125e-06, + "loss": -0.0203, + "num_tokens": 75837530.0, + "reward": 1.4071049690246582, + "reward_std": 0.15914778411388397, + "rewards/contains_chinese/mean": 0.9523809552192688, + "rewards/contains_chinese/std": 0.21423791348934174, + "rewards/correct_reward_func/mean": 0.454723984003067, + "rewards/correct_reward_func/std": 0.16848695278167725, + "step": 585 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2317.0, + "completions/max_terminated_length": 2317.0, + "completions/mean_length": 1569.4881591796875, + "completions/mean_terminated_length": 1569.4881591796875, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 0.9127725856697819, + "grad_norm": 0.5686935782432556, + "kl": 0.05123456381261349, + "learning_rate": 1.6406249999999999e-06, + "loss": 0.0122, + "num_tokens": 75975445.0, + "reward": 1.4490768909454346, + "reward_std": 0.06693350523710251, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4490768313407898, + "rewards/correct_reward_func/std": 0.16664773225784302, + "step": 586 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2338.0, + "completions/max_terminated_length": 2338.0, + "completions/mean_length": 1549.047607421875, + "completions/mean_terminated_length": 1549.047607421875, + "completions/min_length": 873.0, + "completions/min_terminated_length": 873.0, + "epoch": 0.9143302180685359, + "grad_norm": 0.566740095615387, + "kl": 0.05220544897019863, + "learning_rate": 1.6399999999999998e-06, + "loss": -0.0109, + "num_tokens": 76111565.0, + "reward": 1.464043378829956, + "reward_std": 0.12711408734321594, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4878527820110321, + "rewards/correct_reward_func/std": 0.16010543704032898, + "step": 587 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2122.0, + "completions/mean_length": 1619.8214111328125, + "completions/mean_terminated_length": 1540.6385498046875, + "completions/min_length": 834.0, + "completions/min_terminated_length": 834.0, + "epoch": 0.9158878504672897, + "grad_norm": 0.5762239694595337, + "kl": 0.052646003663539886, + "learning_rate": 1.6393749999999999e-06, + "loss": 0.0575, + "num_tokens": 76253672.0, + "reward": 1.441072702407837, + "reward_std": 0.0739678293466568, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44107261300086975, + "rewards/correct_reward_func/std": 0.12623170018196106, + "step": 588 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2457.0, + "completions/max_terminated_length": 2457.0, + "completions/mean_length": 1510.1309814453125, + "completions/mean_terminated_length": 1510.1309814453125, + "completions/min_length": 1069.0, + "completions/min_terminated_length": 1069.0, + "epoch": 0.9174454828660437, + "grad_norm": 0.5868603587150574, + "kl": 0.05117998085916042, + "learning_rate": 1.6387499999999998e-06, + "loss": -0.0027, + "num_tokens": 76386439.0, + "reward": 1.5747073888778687, + "reward_std": 0.06914177536964417, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5747074484825134, + "rewards/correct_reward_func/std": 0.14264759421348572, + "step": 589 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2371.0, + "completions/max_terminated_length": 2371.0, + "completions/mean_length": 1601.357177734375, + "completions/mean_terminated_length": 1601.357177734375, + "completions/min_length": 1061.0, + "completions/min_terminated_length": 1061.0, + "epoch": 0.9190031152647975, + "grad_norm": 0.5775259137153625, + "kl": 0.05480557680130005, + "learning_rate": 1.6381249999999999e-06, + "loss": 0.008, + "num_tokens": 76527043.0, + "reward": 1.4700545072555542, + "reward_std": 0.08241315186023712, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4819592535495758, + "rewards/correct_reward_func/std": 0.14215821027755737, + "step": 590 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2647.0, + "completions/max_terminated_length": 2647.0, + "completions/mean_length": 1482.797607421875, + "completions/mean_terminated_length": 1482.797607421875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "epoch": 0.9205607476635514, + "grad_norm": 0.599651575088501, + "kl": 0.05351861007511616, + "learning_rate": 1.6374999999999998e-06, + "loss": -0.0187, + "num_tokens": 76657436.0, + "reward": 1.5297892093658447, + "reward_std": 0.06223803758621216, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5297890901565552, + "rewards/correct_reward_func/std": 0.18300145864486694, + "step": 591 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2845.0, + "completions/max_terminated_length": 2845.0, + "completions/mean_length": 1613.2738037109375, + "completions/mean_terminated_length": 1613.2738037109375, + "completions/min_length": 1017.0, + "completions/min_terminated_length": 1017.0, + "epoch": 0.9221183800623053, + "grad_norm": 0.5727816224098206, + "kl": 0.0537562221288681, + "learning_rate": 1.636875e-06, + "loss": 0.012, + "num_tokens": 76798939.0, + "reward": 1.5009434223175049, + "reward_std": 0.08991846442222595, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.5247528553009033, + "rewards/correct_reward_func/std": 0.18158107995986938, + "step": 592 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2359.0, + "completions/mean_length": 1659.5714111328125, + "completions/mean_terminated_length": 1580.867431640625, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "epoch": 0.9236760124610592, + "grad_norm": 0.5455081462860107, + "kl": 0.05132809653878212, + "learning_rate": 1.63625e-06, + "loss": 0.0699, + "num_tokens": 76944613.0, + "reward": 1.4970442056655884, + "reward_std": 0.09464308619499207, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5089487433433533, + "rewards/correct_reward_func/std": 0.16302646696567535, + "step": 593 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2390.0, + "completions/max_terminated_length": 2390.0, + "completions/mean_length": 1536.5238037109375, + "completions/mean_terminated_length": 1536.5238037109375, + "completions/min_length": 719.0, + "completions/min_terminated_length": 719.0, + "epoch": 0.9252336448598131, + "grad_norm": 0.5868439078330994, + "kl": 0.05363127030432224, + "learning_rate": 1.6356250000000001e-06, + "loss": -0.0331, + "num_tokens": 77079573.0, + "reward": 1.4771671295166016, + "reward_std": 0.05991039052605629, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4771670699119568, + "rewards/correct_reward_func/std": 0.1585981547832489, + "step": 594 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2357.0, + "completions/max_terminated_length": 2357.0, + "completions/mean_length": 1595.8095703125, + "completions/mean_terminated_length": 1595.8095703125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.926791277258567, + "grad_norm": 0.5364544987678528, + "kl": 0.055180374532938004, + "learning_rate": 1.635e-06, + "loss": 0.0001, + "num_tokens": 77219441.0, + "reward": 1.4917380809783936, + "reward_std": 0.09097646921873093, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4917379915714264, + "rewards/correct_reward_func/std": 0.16991373896598816, + "step": 595 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2668.0, + "completions/max_terminated_length": 2668.0, + "completions/mean_length": 1606.0833740234375, + "completions/mean_terminated_length": 1606.0833740234375, + "completions/min_length": 1030.0, + "completions/min_terminated_length": 1030.0, + "epoch": 0.9283489096573209, + "grad_norm": 0.5445214509963989, + "kl": 0.05271473526954651, + "learning_rate": 1.634375e-06, + "loss": -0.023, + "num_tokens": 77360292.0, + "reward": 1.4440690279006958, + "reward_std": 0.048418521881103516, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44406890869140625, + "rewards/correct_reward_func/std": 0.16575324535369873, + "step": 596 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2422.0, + "completions/max_terminated_length": 2422.0, + "completions/mean_length": 1495.71435546875, + "completions/mean_terminated_length": 1495.71435546875, + "completions/min_length": 1008.0, + "completions/min_terminated_length": 1008.0, + "epoch": 0.9299065420560748, + "grad_norm": 0.5676112771034241, + "kl": 0.05709120258688927, + "learning_rate": 1.63375e-06, + "loss": -0.0011, + "num_tokens": 77491650.0, + "reward": 1.5036193132400513, + "reward_std": 0.07075376063585281, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5036192536354065, + "rewards/correct_reward_func/std": 0.15129926800727844, + "step": 597 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2412.0, + "completions/max_terminated_length": 2412.0, + "completions/mean_length": 1607.1785888671875, + "completions/mean_terminated_length": 1607.1785888671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.9314641744548287, + "grad_norm": 0.5542066097259521, + "kl": 0.05464941821992397, + "learning_rate": 1.633125e-06, + "loss": 0.0103, + "num_tokens": 77632623.0, + "reward": 1.4994677305221558, + "reward_std": 0.09316570311784744, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.511372447013855, + "rewards/correct_reward_func/std": 0.10848263651132584, + "step": 598 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2396.0, + "completions/max_terminated_length": 2396.0, + "completions/mean_length": 1543.1309814453125, + "completions/mean_terminated_length": 1543.1309814453125, + "completions/min_length": 375.0, + "completions/min_terminated_length": 375.0, + "epoch": 0.9330218068535826, + "grad_norm": 0.5920741558074951, + "kl": 0.05645397678017616, + "learning_rate": 1.6325e-06, + "loss": 0.0179, + "num_tokens": 77768348.0, + "reward": 1.4137533903121948, + "reward_std": 0.130188450217247, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669144809246063, + "rewards/correct_reward_func/mean": 0.44946759939193726, + "rewards/correct_reward_func/std": 0.1394752562046051, + "step": 599 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2292.0, + "completions/max_terminated_length": 2292.0, + "completions/mean_length": 1543.8214111328125, + "completions/mean_terminated_length": 1543.8214111328125, + "completions/min_length": 1063.0, + "completions/min_terminated_length": 1063.0, + "epoch": 0.9345794392523364, + "grad_norm": 0.629948079586029, + "kl": 0.05816573277115822, + "learning_rate": 1.631875e-06, + "loss": 0.008, + "num_tokens": 77903795.0, + "reward": 1.4406551122665405, + "reward_std": 0.08957747370004654, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44065502285957336, + "rewards/correct_reward_func/std": 0.16707439720630646, + "step": 600 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2268.0, + "completions/mean_length": 1753.5714111328125, + "completions/mean_terminated_length": 1675.9998779296875, + "completions/min_length": 1085.0, + "completions/min_terminated_length": 1085.0, + "epoch": 0.9361370716510904, + "grad_norm": 0.596502959728241, + "kl": 0.05688577890396118, + "learning_rate": 1.63125e-06, + "loss": 0.0596, + "num_tokens": 78057203.0, + "reward": 1.4966117143630981, + "reward_std": 0.08040610700845718, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4966115951538086, + "rewards/correct_reward_func/std": 0.22688382863998413, + "step": 601 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3153.0, + "completions/max_terminated_length": 3153.0, + "completions/mean_length": 1590.25, + "completions/mean_terminated_length": 1590.25, + "completions/min_length": 910.0, + "completions/min_terminated_length": 910.0, + "epoch": 0.9376947040498442, + "grad_norm": 0.5286832451820374, + "kl": 0.05895489826798439, + "learning_rate": 1.630625e-06, + "loss": -0.0303, + "num_tokens": 78196856.0, + "reward": 1.4874906539916992, + "reward_std": 0.044962868094444275, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48749059438705444, + "rewards/correct_reward_func/std": 0.1546032726764679, + "step": 602 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2431.0, + "completions/max_terminated_length": 2431.0, + "completions/mean_length": 1600.21435546875, + "completions/mean_terminated_length": 1600.21435546875, + "completions/min_length": 681.0, + "completions/min_terminated_length": 681.0, + "epoch": 0.9392523364485982, + "grad_norm": 0.5734366178512573, + "kl": 0.05501212365925312, + "learning_rate": 1.6299999999999999e-06, + "loss": 0.0089, + "num_tokens": 78337274.0, + "reward": 1.5072236061096191, + "reward_std": 0.05490497127175331, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5072234869003296, + "rewards/correct_reward_func/std": 0.11040540784597397, + "step": 603 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2413.0, + "completions/max_terminated_length": 2413.0, + "completions/mean_length": 1633.90478515625, + "completions/mean_terminated_length": 1633.90478515625, + "completions/min_length": 1092.0, + "completions/min_terminated_length": 1092.0, + "epoch": 0.940809968847352, + "grad_norm": 0.5251239538192749, + "kl": 0.059018656611442566, + "learning_rate": 1.629375e-06, + "loss": -0.0014, + "num_tokens": 78480546.0, + "reward": 1.4861642122268677, + "reward_std": 0.06594221293926239, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4980688989162445, + "rewards/correct_reward_func/std": 0.16121798753738403, + "step": 604 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2623.0, + "completions/max_terminated_length": 2623.0, + "completions/mean_length": 1652.1309814453125, + "completions/mean_terminated_length": 1652.1309814453125, + "completions/min_length": 776.0, + "completions/min_terminated_length": 776.0, + "epoch": 0.942367601246106, + "grad_norm": 0.5935388803482056, + "kl": 0.051552364602684975, + "learning_rate": 1.6287499999999999e-06, + "loss": 0.0244, + "num_tokens": 78625445.0, + "reward": 1.4352984428405762, + "reward_std": 0.13022708892822266, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669144809246063, + "rewards/correct_reward_func/mean": 0.47101256251335144, + "rewards/correct_reward_func/std": 0.1250392645597458, + "step": 605 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3299.0, + "completions/mean_length": 1707.4405517578125, + "completions/mean_terminated_length": 1629.313232421875, + "completions/min_length": 1127.0, + "completions/min_terminated_length": 1127.0, + "epoch": 0.9439252336448598, + "grad_norm": 0.6000062227249146, + "kl": 0.054089561104774475, + "learning_rate": 1.628125e-06, + "loss": 0.0647, + "num_tokens": 78774852.0, + "reward": 1.4837253093719482, + "reward_std": 0.07214730232954025, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4837252199649811, + "rewards/correct_reward_func/std": 0.1362723410129547, + "step": 606 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2870.0, + "completions/mean_length": 1669.357177734375, + "completions/mean_terminated_length": 1590.77099609375, + "completions/min_length": 1059.0, + "completions/min_terminated_length": 1059.0, + "epoch": 0.9454828660436138, + "grad_norm": 0.5420730113983154, + "kl": 0.051561569795012474, + "learning_rate": 1.6274999999999999e-06, + "loss": 0.0154, + "num_tokens": 78921096.0, + "reward": 1.472241759300232, + "reward_std": 0.10751719772815704, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4841463565826416, + "rewards/correct_reward_func/std": 0.1544354408979416, + "step": 607 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2360.0, + "completions/max_terminated_length": 2360.0, + "completions/mean_length": 1506.25, + "completions/mean_terminated_length": 1506.25, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "epoch": 0.9470404984423676, + "grad_norm": 0.6013126373291016, + "kl": 0.05470665171742439, + "learning_rate": 1.626875e-06, + "loss": 0.0038, + "num_tokens": 79053429.0, + "reward": 1.5111743211746216, + "reward_std": 0.06690473109483719, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5111743211746216, + "rewards/correct_reward_func/std": 0.18495941162109375, + "step": 608 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2235.0, + "completions/mean_length": 1583.166748046875, + "completions/mean_terminated_length": 1503.5421142578125, + "completions/min_length": 961.0, + "completions/min_terminated_length": 961.0, + "epoch": 0.9485981308411215, + "grad_norm": 0.5386534929275513, + "kl": 0.051652608439326286, + "learning_rate": 1.6262499999999999e-06, + "loss": 0.056, + "num_tokens": 79192355.0, + "reward": 1.4510307312011719, + "reward_std": 0.0934084877371788, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4629353880882263, + "rewards/correct_reward_func/std": 0.14852337539196014, + "step": 609 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2729.0, + "completions/max_terminated_length": 2729.0, + "completions/mean_length": 1621.202392578125, + "completions/mean_terminated_length": 1621.202392578125, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 0.9501557632398754, + "grad_norm": 0.6040880680084229, + "kl": 0.053494108840823174, + "learning_rate": 1.625625e-06, + "loss": -0.0353, + "num_tokens": 79334620.0, + "reward": 1.4715975522994995, + "reward_std": 0.0786285549402237, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4715975821018219, + "rewards/correct_reward_func/std": 0.11298926919698715, + "step": 610 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2967.0, + "completions/max_terminated_length": 2967.0, + "completions/mean_length": 1519.4285888671875, + "completions/mean_terminated_length": 1519.4285888671875, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 0.9517133956386293, + "grad_norm": 0.5844832062721252, + "kl": 0.0530572235584259, + "learning_rate": 1.625e-06, + "loss": 0.0093, + "num_tokens": 79468030.0, + "reward": 1.5320613384246826, + "reward_std": 0.05157333239912987, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5320611596107483, + "rewards/correct_reward_func/std": 0.14916525781154633, + "step": 611 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2165.0, + "completions/max_terminated_length": 2165.0, + "completions/mean_length": 1606.5, + "completions/mean_terminated_length": 1606.5, + "completions/min_length": 862.0, + "completions/min_terminated_length": 862.0, + "epoch": 0.9532710280373832, + "grad_norm": 0.5951868295669556, + "kl": 0.05439838580787182, + "learning_rate": 1.6243749999999998e-06, + "loss": -0.0238, + "num_tokens": 79608940.0, + "reward": 1.4870160818099976, + "reward_std": 0.1107514277100563, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4989207088947296, + "rewards/correct_reward_func/std": 0.1661691665649414, + "step": 612 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2785.0, + "completions/max_terminated_length": 2785.0, + "completions/mean_length": 1537.761962890625, + "completions/mean_terminated_length": 1537.761962890625, + "completions/min_length": 875.0, + "completions/min_terminated_length": 875.0, + "epoch": 0.9548286604361371, + "grad_norm": 0.5835762023925781, + "kl": 0.054533904418349266, + "learning_rate": 1.62375e-06, + "loss": 0.0028, + "num_tokens": 79744142.0, + "reward": 1.4982998371124268, + "reward_std": 0.08925885707139969, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5102044939994812, + "rewards/correct_reward_func/std": 0.21719758212566376, + "step": 613 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2433.0, + "completions/max_terminated_length": 2433.0, + "completions/mean_length": 1523.2261962890625, + "completions/mean_terminated_length": 1523.2261962890625, + "completions/min_length": 857.0, + "completions/min_terminated_length": 857.0, + "epoch": 0.956386292834891, + "grad_norm": 0.5852372646331787, + "kl": 0.05277659185230732, + "learning_rate": 1.6231249999999998e-06, + "loss": -0.0142, + "num_tokens": 79878015.0, + "reward": 1.458331823348999, + "reward_std": 0.0705060064792633, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4583317041397095, + "rewards/correct_reward_func/std": 0.1471869796514511, + "step": 614 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2318.0, + "completions/max_terminated_length": 2318.0, + "completions/mean_length": 1595.261962890625, + "completions/mean_terminated_length": 1595.261962890625, + "completions/min_length": 1045.0, + "completions/min_terminated_length": 1045.0, + "epoch": 0.9579439252336449, + "grad_norm": 0.5480629801750183, + "kl": 0.0494478065520525, + "learning_rate": 1.6225e-06, + "loss": -0.0132, + "num_tokens": 80017915.0, + "reward": 1.4836761951446533, + "reward_std": 0.06662869453430176, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4836761951446533, + "rewards/correct_reward_func/std": 0.1571483314037323, + "step": 615 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2420.0, + "completions/max_terminated_length": 2420.0, + "completions/mean_length": 1565.5238037109375, + "completions/mean_terminated_length": 1565.5238037109375, + "completions/min_length": 920.0, + "completions/min_terminated_length": 920.0, + "epoch": 0.9595015576323987, + "grad_norm": 0.5563946962356567, + "kl": 0.05082782916724682, + "learning_rate": 1.6218749999999998e-06, + "loss": 0.0008, + "num_tokens": 80155491.0, + "reward": 1.5200374126434326, + "reward_std": 0.0669645443558693, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5200372934341431, + "rewards/correct_reward_func/std": 0.18864794075489044, + "step": 616 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2085.0, + "completions/max_terminated_length": 2085.0, + "completions/mean_length": 1500.7381591796875, + "completions/mean_terminated_length": 1500.7381591796875, + "completions/min_length": 438.0, + "completions/min_terminated_length": 438.0, + "epoch": 0.9610591900311527, + "grad_norm": 0.5845383405685425, + "kl": 0.051540493965148926, + "learning_rate": 1.62125e-06, + "loss": -0.0132, + "num_tokens": 80287547.0, + "reward": 1.5222618579864502, + "reward_std": 0.08891423046588898, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5341665148735046, + "rewards/correct_reward_func/std": 0.1643657386302948, + "step": 617 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2304.0, + "completions/max_terminated_length": 2304.0, + "completions/mean_length": 1515.0357666015625, + "completions/mean_terminated_length": 1515.0357666015625, + "completions/min_length": 872.0, + "completions/min_terminated_length": 872.0, + "epoch": 0.9626168224299065, + "grad_norm": 0.6092395782470703, + "kl": 0.05296482518315315, + "learning_rate": 1.6206249999999998e-06, + "loss": 0.0086, + "num_tokens": 80420648.0, + "reward": 1.4399313926696777, + "reward_std": 0.08079881966114044, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.43993115425109863, + "rewards/correct_reward_func/std": 0.13497735559940338, + "step": 618 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2472.0, + "completions/max_terminated_length": 2472.0, + "completions/mean_length": 1513.666748046875, + "completions/mean_terminated_length": 1513.666748046875, + "completions/min_length": 987.0, + "completions/min_terminated_length": 987.0, + "epoch": 0.9641744548286605, + "grad_norm": 0.5746808052062988, + "kl": 0.05054004117846489, + "learning_rate": 1.62e-06, + "loss": -0.0325, + "num_tokens": 80553754.0, + "reward": 1.478222370147705, + "reward_std": 0.05955832451581955, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4782223105430603, + "rewards/correct_reward_func/std": 0.13707558810710907, + "step": 619 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3365.0, + "completions/max_terminated_length": 3365.0, + "completions/mean_length": 1535.46435546875, + "completions/mean_terminated_length": 1535.46435546875, + "completions/min_length": 869.0, + "completions/min_terminated_length": 869.0, + "epoch": 0.9657320872274143, + "grad_norm": 0.6188405752182007, + "kl": 0.05118212662637234, + "learning_rate": 1.6193749999999998e-06, + "loss": 0.068, + "num_tokens": 80688679.0, + "reward": 1.4972518682479858, + "reward_std": 0.10318266600370407, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.521061360836029, + "rewards/correct_reward_func/std": 0.15406495332717896, + "step": 620 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2192.0, + "completions/mean_length": 1599.0357666015625, + "completions/mean_terminated_length": 1519.602294921875, + "completions/min_length": 1039.0, + "completions/min_terminated_length": 1039.0, + "epoch": 0.9672897196261683, + "grad_norm": 0.5300514698028564, + "kl": 0.04921773634850979, + "learning_rate": 1.6187499999999997e-06, + "loss": 0.0581, + "num_tokens": 80829022.0, + "reward": 1.4415113925933838, + "reward_std": 0.08664444088935852, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44151124358177185, + "rewards/correct_reward_func/std": 0.12956391274929047, + "step": 621 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2215.0, + "completions/mean_length": 1577.96435546875, + "completions/mean_terminated_length": 1498.277099609375, + "completions/min_length": 996.0, + "completions/min_terminated_length": 996.0, + "epoch": 0.9688473520249221, + "grad_norm": 0.585616409778595, + "kl": 0.0514204315841198, + "learning_rate": 1.6181249999999999e-06, + "loss": 0.0696, + "num_tokens": 80967613.0, + "reward": 1.4526182413101196, + "reward_std": 0.07754890620708466, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45261818170547485, + "rewards/correct_reward_func/std": 0.14640595018863678, + "step": 622 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2377.0, + "completions/mean_length": 1603.0714111328125, + "completions/mean_terminated_length": 1523.6866455078125, + "completions/min_length": 851.0, + "completions/min_terminated_length": 851.0, + "epoch": 0.9704049844236761, + "grad_norm": 0.6005199551582336, + "kl": 0.05098399519920349, + "learning_rate": 1.6174999999999998e-06, + "loss": 0.0495, + "num_tokens": 81108259.0, + "reward": 1.4635608196258545, + "reward_std": 0.09107384085655212, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4754655957221985, + "rewards/correct_reward_func/std": 0.12346034497022629, + "step": 623 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2418.0, + "completions/max_terminated_length": 2418.0, + "completions/mean_length": 1519.261962890625, + "completions/mean_terminated_length": 1519.261962890625, + "completions/min_length": 861.0, + "completions/min_terminated_length": 861.0, + "epoch": 0.9719626168224299, + "grad_norm": 0.6062273383140564, + "kl": 0.049668088555336, + "learning_rate": 1.616875e-06, + "loss": 0.031, + "num_tokens": 81241739.0, + "reward": 1.4660530090332031, + "reward_std": 0.062135469168424606, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4660530090332031, + "rewards/correct_reward_func/std": 0.12143100053071976, + "step": 624 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2461.0, + "completions/max_terminated_length": 2461.0, + "completions/mean_length": 1491.5714111328125, + "completions/mean_terminated_length": 1491.5714111328125, + "completions/min_length": 906.0, + "completions/min_terminated_length": 906.0, + "epoch": 0.9735202492211839, + "grad_norm": 0.6340486407279968, + "kl": 0.05301540531218052, + "learning_rate": 1.61625e-06, + "loss": -0.0211, + "num_tokens": 81372875.0, + "reward": 1.487614393234253, + "reward_std": 0.0698120966553688, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48761430382728577, + "rewards/correct_reward_func/std": 0.13601452112197876, + "step": 625 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2163.0, + "completions/max_terminated_length": 2163.0, + "completions/mean_length": 1590.8929443359375, + "completions/mean_terminated_length": 1590.8929443359375, + "completions/min_length": 893.0, + "completions/min_terminated_length": 893.0, + "epoch": 0.9750778816199377, + "grad_norm": 0.5444204807281494, + "kl": 0.0505395382642746, + "learning_rate": 1.615625e-06, + "loss": 0.0167, + "num_tokens": 81512804.0, + "reward": 1.4724253416061401, + "reward_std": 0.05580959469079971, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47242528200149536, + "rewards/correct_reward_func/std": 0.12042959779500961, + "step": 626 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2706.0, + "completions/max_terminated_length": 2706.0, + "completions/mean_length": 1554.75, + "completions/mean_terminated_length": 1554.75, + "completions/min_length": 795.0, + "completions/min_terminated_length": 795.0, + "epoch": 0.9766355140186916, + "grad_norm": 0.5865889191627502, + "kl": 0.05015707015991211, + "learning_rate": 1.615e-06, + "loss": 0.004, + "num_tokens": 81649517.0, + "reward": 1.4877718687057495, + "reward_std": 0.08064839243888855, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4996766448020935, + "rewards/correct_reward_func/std": 0.1834038645029068, + "step": 627 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2110.0, + "completions/max_terminated_length": 2110.0, + "completions/mean_length": 1443.4761962890625, + "completions/mean_terminated_length": 1443.4761962890625, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 0.9781931464174455, + "grad_norm": 0.5513234734535217, + "kl": 0.054139742627739906, + "learning_rate": 1.614375e-06, + "loss": 0.006, + "num_tokens": 81776583.0, + "reward": 1.5102035999298096, + "reward_std": 0.05765219032764435, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5102035999298096, + "rewards/correct_reward_func/std": 0.1545766443014145, + "step": 628 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2619.0, + "completions/max_terminated_length": 2619.0, + "completions/mean_length": 1546.21435546875, + "completions/mean_terminated_length": 1546.21435546875, + "completions/min_length": 775.0, + "completions/min_terminated_length": 775.0, + "epoch": 0.9797507788161994, + "grad_norm": 0.592833936214447, + "kl": 0.05099848657846451, + "learning_rate": 1.61375e-06, + "loss": 0.0172, + "num_tokens": 81912435.0, + "reward": 1.4125304222106934, + "reward_std": 0.08528114855289459, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.424435019493103, + "rewards/correct_reward_func/std": 0.11915619671344757, + "step": 629 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2346.0, + "completions/mean_length": 1624.5238037109375, + "completions/mean_terminated_length": 1545.3975830078125, + "completions/min_length": 1048.0, + "completions/min_terminated_length": 1048.0, + "epoch": 0.9813084112149533, + "grad_norm": 0.5546726584434509, + "kl": 0.05285438522696495, + "learning_rate": 1.613125e-06, + "loss": 0.0583, + "num_tokens": 82054865.0, + "reward": 1.5317888259887695, + "reward_std": 0.12638387084007263, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5436934232711792, + "rewards/correct_reward_func/std": 0.1404784917831421, + "step": 630 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3120.0, + "completions/max_terminated_length": 3120.0, + "completions/mean_length": 1527.8095703125, + "completions/mean_terminated_length": 1527.8095703125, + "completions/min_length": 993.0, + "completions/min_terminated_length": 993.0, + "epoch": 0.9828660436137072, + "grad_norm": 0.5793138146400452, + "kl": 0.052825529128313065, + "learning_rate": 1.6125e-06, + "loss": 0.0209, + "num_tokens": 82189249.0, + "reward": 1.470069408416748, + "reward_std": 0.07235594838857651, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4700692892074585, + "rewards/correct_reward_func/std": 0.13610175251960754, + "step": 631 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3391.0, + "completions/mean_length": 1610.011962890625, + "completions/mean_terminated_length": 1530.7108154296875, + "completions/min_length": 843.0, + "completions/min_terminated_length": 843.0, + "epoch": 0.9844236760124611, + "grad_norm": 0.5624516606330872, + "kl": 0.0496527124196291, + "learning_rate": 1.611875e-06, + "loss": 0.0426, + "num_tokens": 82330616.0, + "reward": 1.531969428062439, + "reward_std": 0.10485806316137314, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.543874204158783, + "rewards/correct_reward_func/std": 0.16279828548431396, + "step": 632 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2136.0, + "completions/max_terminated_length": 2136.0, + "completions/mean_length": 1469.1190185546875, + "completions/mean_terminated_length": 1469.1190185546875, + "completions/min_length": 953.0, + "completions/min_terminated_length": 953.0, + "epoch": 0.985981308411215, + "grad_norm": 0.5987145304679871, + "kl": 0.052102504298090935, + "learning_rate": 1.61125e-06, + "loss": 0.019, + "num_tokens": 82459950.0, + "reward": 1.5328919887542725, + "reward_std": 0.08720895648002625, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5447967648506165, + "rewards/correct_reward_func/std": 0.17077518999576569, + "step": 633 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 1462.15478515625, + "completions/mean_terminated_length": 1462.15478515625, + "completions/min_length": 789.0, + "completions/min_terminated_length": 789.0, + "epoch": 0.9875389408099688, + "grad_norm": 0.5946549773216248, + "kl": 0.05218057334423065, + "learning_rate": 1.610625e-06, + "loss": -0.0021, + "num_tokens": 82588483.0, + "reward": 1.4602668285369873, + "reward_std": 0.06267713755369186, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46026673913002014, + "rewards/correct_reward_func/std": 0.11811360716819763, + "step": 634 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2227.0, + "completions/max_terminated_length": 2227.0, + "completions/mean_length": 1475.15478515625, + "completions/mean_terminated_length": 1475.15478515625, + "completions/min_length": 870.0, + "completions/min_terminated_length": 870.0, + "epoch": 0.9890965732087228, + "grad_norm": 0.6137917041778564, + "kl": 0.05290712043642998, + "learning_rate": 1.61e-06, + "loss": 0.0123, + "num_tokens": 82718324.0, + "reward": 1.4867254495620728, + "reward_std": 0.060699447989463806, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.486725389957428, + "rewards/correct_reward_func/std": 0.15401968359947205, + "step": 635 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3418.0, + "completions/mean_length": 1539.71435546875, + "completions/mean_terminated_length": 1459.566162109375, + "completions/min_length": 876.0, + "completions/min_terminated_length": 876.0, + "epoch": 0.9906542056074766, + "grad_norm": 0.5406956672668457, + "kl": 0.05111967213451862, + "learning_rate": 1.609375e-06, + "loss": 0.0556, + "num_tokens": 82853534.0, + "reward": 1.5321414470672607, + "reward_std": 0.08517434448003769, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.532141387462616, + "rewards/correct_reward_func/std": 0.1582731455564499, + "step": 636 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2517.0, + "completions/max_terminated_length": 2517.0, + "completions/mean_length": 1444.59521484375, + "completions/mean_terminated_length": 1444.59521484375, + "completions/min_length": 589.0, + "completions/min_terminated_length": 589.0, + "epoch": 0.9922118380062306, + "grad_norm": 0.5991398096084595, + "kl": 0.04915725626051426, + "learning_rate": 1.6087499999999998e-06, + "loss": 0.0246, + "num_tokens": 82980928.0, + "reward": 1.5238535404205322, + "reward_std": 0.04892566055059433, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5238535404205322, + "rewards/correct_reward_func/std": 0.12992213666439056, + "step": 637 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2228.0, + "completions/max_terminated_length": 2228.0, + "completions/mean_length": 1405.952392578125, + "completions/mean_terminated_length": 1405.952392578125, + "completions/min_length": 444.0, + "completions/min_terminated_length": 444.0, + "epoch": 0.9937694704049844, + "grad_norm": 0.6216338872909546, + "kl": 0.052867574617266655, + "learning_rate": 1.608125e-06, + "loss": 0.0101, + "num_tokens": 83104848.0, + "reward": 1.5133877992630005, + "reward_std": 0.07994896173477173, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5133876204490662, + "rewards/correct_reward_func/std": 0.1903848499059677, + "step": 638 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2685.0, + "completions/max_terminated_length": 2685.0, + "completions/mean_length": 1475.5833740234375, + "completions/mean_terminated_length": 1475.5833740234375, + "completions/min_length": 629.0, + "completions/min_terminated_length": 629.0, + "epoch": 0.9953271028037384, + "grad_norm": 0.5925660133361816, + "kl": 0.05206291750073433, + "learning_rate": 1.6074999999999999e-06, + "loss": 0.0255, + "num_tokens": 83234737.0, + "reward": 1.497523307800293, + "reward_std": 0.0626191571354866, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4975232183933258, + "rewards/correct_reward_func/std": 0.18619437515735626, + "step": 639 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 5409.0, + "completions/max_terminated_length": 5409.0, + "completions/mean_length": 1527.107177734375, + "completions/mean_terminated_length": 1527.107177734375, + "completions/min_length": 227.0, + "completions/min_terminated_length": 227.0, + "epoch": 0.9968847352024922, + "grad_norm": 0.6247422695159912, + "kl": 0.04974444583058357, + "learning_rate": 1.606875e-06, + "loss": -0.0138, + "num_tokens": 83369086.0, + "reward": 1.4803240299224854, + "reward_std": 0.07588109374046326, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4922286570072174, + "rewards/correct_reward_func/std": 0.15059354901313782, + "step": 640 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2251.0, + "completions/max_terminated_length": 2251.0, + "completions/mean_length": 1502.4761962890625, + "completions/mean_terminated_length": 1502.4761962890625, + "completions/min_length": 786.0, + "completions/min_terminated_length": 786.0, + "epoch": 0.9984423676012462, + "grad_norm": 0.5777775645256042, + "kl": 0.050743360072374344, + "learning_rate": 1.6062499999999999e-06, + "loss": -0.0207, + "num_tokens": 83501378.0, + "reward": 1.5227001905441284, + "reward_std": 0.05805504322052002, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5227001309394836, + "rewards/correct_reward_func/std": 0.12714529037475586, + "step": 641 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2487.0, + "completions/max_terminated_length": 2487.0, + "completions/mean_length": 1441.857177734375, + "completions/mean_terminated_length": 1441.857177734375, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 1.0, + "grad_norm": 0.6082573533058167, + "kl": 0.05472211726009846, + "learning_rate": 1.605625e-06, + "loss": 0.0021, + "num_tokens": 83628314.0, + "reward": 1.4580137729644775, + "reward_std": 0.11369025707244873, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4818231463432312, + "rewards/correct_reward_func/std": 0.1592247188091278, + "step": 642 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2915.0, + "completions/mean_length": 1665.71435546875, + "completions/mean_terminated_length": 1587.084228515625, + "completions/min_length": 1055.0, + "completions/min_terminated_length": 1055.0, + "epoch": 1.0015576323987538, + "grad_norm": 0.5756648778915405, + "kl": 0.05213580280542374, + "learning_rate": 1.6049999999999999e-06, + "loss": 0.0548, + "num_tokens": 83774564.0, + "reward": 1.4815843105316162, + "reward_std": 0.07832953333854675, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48158419132232666, + "rewards/correct_reward_func/std": 0.12412244081497192, + "step": 643 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2377.0, + "completions/max_terminated_length": 2377.0, + "completions/mean_length": 1476.797607421875, + "completions/mean_terminated_length": 1476.797607421875, + "completions/min_length": 644.0, + "completions/min_terminated_length": 644.0, + "epoch": 1.0031152647975077, + "grad_norm": 0.594108521938324, + "kl": 0.05231509543955326, + "learning_rate": 1.604375e-06, + "loss": 0.0001, + "num_tokens": 83904741.0, + "reward": 1.5084398984909058, + "reward_std": 0.08493813127279282, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5203444957733154, + "rewards/correct_reward_func/std": 0.11909134685993195, + "step": 644 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2452.0, + "completions/max_terminated_length": 2452.0, + "completions/mean_length": 1510.357177734375, + "completions/mean_terminated_length": 1510.357177734375, + "completions/min_length": 867.0, + "completions/min_terminated_length": 867.0, + "epoch": 1.0046728971962617, + "grad_norm": 0.6033718585968018, + "kl": 0.05159814655780792, + "learning_rate": 1.6037499999999999e-06, + "loss": -0.0043, + "num_tokens": 84037497.0, + "reward": 1.5596506595611572, + "reward_std": 0.06282084435224533, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5596506595611572, + "rewards/correct_reward_func/std": 0.18580923974514008, + "step": 645 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2500.0, + "completions/max_terminated_length": 2500.0, + "completions/mean_length": 1567.84521484375, + "completions/mean_terminated_length": 1567.84521484375, + "completions/min_length": 1075.0, + "completions/min_terminated_length": 1075.0, + "epoch": 1.0062305295950156, + "grad_norm": 0.5789066553115845, + "kl": 0.052623504772782326, + "learning_rate": 1.6031249999999998e-06, + "loss": 0.0338, + "num_tokens": 84175148.0, + "reward": 1.5069093704223633, + "reward_std": 0.08146540820598602, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5188140869140625, + "rewards/correct_reward_func/std": 0.13371284306049347, + "step": 646 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2175.0, + "completions/max_terminated_length": 2175.0, + "completions/mean_length": 1484.5, + "completions/mean_terminated_length": 1484.5, + "completions/min_length": 798.0, + "completions/min_terminated_length": 798.0, + "epoch": 1.0077881619937694, + "grad_norm": 0.5949665307998657, + "kl": 0.05212471820414066, + "learning_rate": 1.6025e-06, + "loss": 0.0168, + "num_tokens": 84305870.0, + "reward": 1.5230427980422974, + "reward_std": 0.049363430589437485, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5230426788330078, + "rewards/correct_reward_func/std": 0.12559856474399567, + "step": 647 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.023809523809523836, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 3188.0, + "completions/mean_length": 1677.916748046875, + "completions/mean_terminated_length": 1519.0364990234375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 1.0093457943925233, + "grad_norm": 0.5194585919380188, + "kl": 0.04932490363717079, + "learning_rate": 1.6018749999999998e-06, + "loss": 0.0928, + "num_tokens": 84452887.0, + "reward": 1.4853767156600952, + "reward_std": 0.08370485156774521, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4853765368461609, + "rewards/correct_reward_func/std": 0.16985315084457397, + "step": 648 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2393.0, + "completions/max_terminated_length": 2393.0, + "completions/mean_length": 1499.5714111328125, + "completions/mean_terminated_length": 1499.5714111328125, + "completions/min_length": 778.0, + "completions/min_terminated_length": 778.0, + "epoch": 1.0109034267912773, + "grad_norm": 0.6075387597084045, + "kl": 0.05233858525753021, + "learning_rate": 1.60125e-06, + "loss": 0.0018, + "num_tokens": 84585019.0, + "reward": 1.551127314567566, + "reward_std": 0.05537908524274826, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5511272549629211, + "rewards/correct_reward_func/std": 0.14352430403232574, + "step": 649 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2385.0, + "completions/max_terminated_length": 2385.0, + "completions/mean_length": 1476.90478515625, + "completions/mean_terminated_length": 1476.90478515625, + "completions/min_length": 824.0, + "completions/min_terminated_length": 824.0, + "epoch": 1.0124610591900312, + "grad_norm": 0.5924170017242432, + "kl": 0.05188839137554169, + "learning_rate": 1.6006249999999998e-06, + "loss": 0.0245, + "num_tokens": 84714923.0, + "reward": 1.5396331548690796, + "reward_std": 0.06312854588031769, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5396330952644348, + "rewards/correct_reward_func/std": 0.1734156459569931, + "step": 650 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2618.0, + "completions/max_terminated_length": 2618.0, + "completions/mean_length": 1465.4881591796875, + "completions/mean_terminated_length": 1465.4881591796875, + "completions/min_length": 618.0, + "completions/min_terminated_length": 618.0, + "epoch": 1.014018691588785, + "grad_norm": 0.597648024559021, + "kl": 0.05065512843430042, + "learning_rate": 1.6e-06, + "loss": 0.0192, + "num_tokens": 84844012.0, + "reward": 1.5225285291671753, + "reward_std": 0.05072065815329552, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5225285291671753, + "rewards/correct_reward_func/std": 0.17245061695575714, + "step": 651 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2194.0, + "completions/max_terminated_length": 2194.0, + "completions/mean_length": 1471.21435546875, + "completions/mean_terminated_length": 1471.21435546875, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "epoch": 1.0155763239875388, + "grad_norm": 0.5615136623382568, + "kl": 0.05269411765038967, + "learning_rate": 1.5993749999999998e-06, + "loss": -0.0099, + "num_tokens": 84973576.0, + "reward": 1.488713026046753, + "reward_std": 0.05458061024546623, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48871293663978577, + "rewards/correct_reward_func/std": 0.1673632711172104, + "step": 652 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3245.0, + "completions/max_terminated_length": 3245.0, + "completions/mean_length": 1455.2738037109375, + "completions/mean_terminated_length": 1455.2738037109375, + "completions/min_length": 853.0, + "completions/min_terminated_length": 853.0, + "epoch": 1.017133956386293, + "grad_norm": 0.5472370386123657, + "kl": 0.05032069608569145, + "learning_rate": 1.5987499999999997e-06, + "loss": -0.0272, + "num_tokens": 85101609.0, + "reward": 1.532504916191101, + "reward_std": 0.04785650223493576, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5325048565864563, + "rewards/correct_reward_func/std": 0.19464264810085297, + "step": 653 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2291.0, + "completions/max_terminated_length": 2291.0, + "completions/mean_length": 1474.0714111328125, + "completions/mean_terminated_length": 1474.0714111328125, + "completions/min_length": 463.0, + "completions/min_terminated_length": 463.0, + "epoch": 1.0186915887850467, + "grad_norm": 0.5767446756362915, + "kl": 0.05521121807396412, + "learning_rate": 1.5981249999999998e-06, + "loss": -0.0144, + "num_tokens": 85231431.0, + "reward": 1.4579790830612183, + "reward_std": 0.09587103873491287, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.46988385915756226, + "rewards/correct_reward_func/std": 0.13316239416599274, + "step": 654 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2599.0, + "completions/max_terminated_length": 2599.0, + "completions/mean_length": 1522.8333740234375, + "completions/mean_terminated_length": 1522.8333740234375, + "completions/min_length": 877.0, + "completions/min_terminated_length": 877.0, + "epoch": 1.0202492211838006, + "grad_norm": 0.5800827741622925, + "kl": 0.05396328121423721, + "learning_rate": 1.5975e-06, + "loss": 0.0364, + "num_tokens": 85365211.0, + "reward": 1.501022219657898, + "reward_std": 0.076015904545784, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5010221600532532, + "rewards/correct_reward_func/std": 0.15128661692142487, + "step": 655 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2184.0, + "completions/max_terminated_length": 2184.0, + "completions/mean_length": 1491.1429443359375, + "completions/mean_terminated_length": 1491.1429443359375, + "completions/min_length": 734.0, + "completions/min_terminated_length": 734.0, + "epoch": 1.0218068535825544, + "grad_norm": 0.5550655126571655, + "kl": 0.05126112699508667, + "learning_rate": 1.596875e-06, + "loss": 0.0014, + "num_tokens": 85496383.0, + "reward": 1.4312056303024292, + "reward_std": 0.08455885946750641, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.44311028718948364, + "rewards/correct_reward_func/std": 0.12594355642795563, + "step": 656 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2711.0, + "completions/max_terminated_length": 2711.0, + "completions/mean_length": 1563.452392578125, + "completions/mean_terminated_length": 1563.452392578125, + "completions/min_length": 995.0, + "completions/min_terminated_length": 995.0, + "epoch": 1.0233644859813085, + "grad_norm": 0.5272036194801331, + "kl": 0.05299381539225578, + "learning_rate": 1.59625e-06, + "loss": -0.0094, + "num_tokens": 85633941.0, + "reward": 1.4453727006912231, + "reward_std": 0.060865722596645355, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.44537264108657837, + "rewards/correct_reward_func/std": 0.12575693428516388, + "step": 657 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2520.0, + "completions/max_terminated_length": 2520.0, + "completions/mean_length": 1494.107177734375, + "completions/mean_terminated_length": 1494.107177734375, + "completions/min_length": 879.0, + "completions/min_terminated_length": 879.0, + "epoch": 1.0249221183800623, + "grad_norm": 0.629683792591095, + "kl": 0.053053101524710655, + "learning_rate": 1.595625e-06, + "loss": -0.0115, + "num_tokens": 85765350.0, + "reward": 1.4953358173370361, + "reward_std": 0.06038458272814751, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.49533572793006897, + "rewards/correct_reward_func/std": 0.1519479900598526, + "step": 658 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2727.0, + "completions/max_terminated_length": 2727.0, + "completions/mean_length": 1514.0833740234375, + "completions/mean_terminated_length": 1514.0833740234375, + "completions/min_length": 781.0, + "completions/min_terminated_length": 781.0, + "epoch": 1.0264797507788161, + "grad_norm": 0.5846103429794312, + "kl": 0.05215497314929962, + "learning_rate": 1.595e-06, + "loss": -0.0266, + "num_tokens": 85898593.0, + "reward": 1.5464462041854858, + "reward_std": 0.07301543653011322, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5464460253715515, + "rewards/correct_reward_func/std": 0.11028709262609482, + "step": 659 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2479.0, + "completions/max_terminated_length": 2479.0, + "completions/mean_length": 1448.4405517578125, + "completions/mean_terminated_length": 1448.4405517578125, + "completions/min_length": 602.0, + "completions/min_terminated_length": 602.0, + "epoch": 1.02803738317757, + "grad_norm": 0.6190741658210754, + "kl": 0.052393680438399315, + "learning_rate": 1.594375e-06, + "loss": -0.0017, + "num_tokens": 86026226.0, + "reward": 1.4759933948516846, + "reward_std": 0.04937182739377022, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47599339485168457, + "rewards/correct_reward_func/std": 0.13999196887016296, + "step": 660 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2861.0, + "completions/max_terminated_length": 2861.0, + "completions/mean_length": 1479.8809814453125, + "completions/mean_terminated_length": 1479.8809814453125, + "completions/min_length": 323.0, + "completions/min_terminated_length": 323.0, + "epoch": 1.029595015576324, + "grad_norm": 0.5660926699638367, + "kl": 0.05401911213994026, + "learning_rate": 1.59375e-06, + "loss": -0.0151, + "num_tokens": 86156482.0, + "reward": 1.4849108457565308, + "reward_std": 0.07375740259885788, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.48491087555885315, + "rewards/correct_reward_func/std": 0.17290189862251282, + "step": 661 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4504.0, + "completions/max_terminated_length": 4504.0, + "completions/mean_length": 1467.297607421875, + "completions/mean_terminated_length": 1467.297607421875, + "completions/min_length": 710.0, + "completions/min_terminated_length": 710.0, + "epoch": 1.0311526479750779, + "grad_norm": 0.6604934930801392, + "kl": 0.054234541952610016, + "learning_rate": 1.5931249999999999e-06, + "loss": 0.0182, + "num_tokens": 86285585.0, + "reward": 1.4347630739212036, + "reward_std": 0.0708894431591034, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4347630441188812, + "rewards/correct_reward_func/std": 0.1494257152080536, + "step": 662 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2484.0, + "completions/max_terminated_length": 2484.0, + "completions/mean_length": 1498.416748046875, + "completions/mean_terminated_length": 1498.416748046875, + "completions/min_length": 363.0, + "completions/min_terminated_length": 363.0, + "epoch": 1.0327102803738317, + "grad_norm": 0.5682238936424255, + "kl": 0.05228089354932308, + "learning_rate": 1.5925e-06, + "loss": -0.0695, + "num_tokens": 86417554.0, + "reward": 1.4704004526138306, + "reward_std": 0.09661635756492615, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4704003930091858, + "rewards/correct_reward_func/std": 0.16588261723518372, + "step": 663 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2789.0, + "completions/mean_length": 1710.15478515625, + "completions/mean_terminated_length": 1632.0601806640625, + "completions/min_length": 1021.0, + "completions/min_terminated_length": 1021.0, + "epoch": 1.0342679127725856, + "grad_norm": 0.565608024597168, + "kl": 0.05014815367758274, + "learning_rate": 1.591875e-06, + "loss": 0.0822, + "num_tokens": 86567447.0, + "reward": 1.535957932472229, + "reward_std": 0.08144499361515045, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5359576344490051, + "rewards/correct_reward_func/std": 0.1807517409324646, + "step": 664 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2085.0, + "completions/max_terminated_length": 2085.0, + "completions/mean_length": 1510.21435546875, + "completions/mean_terminated_length": 1510.21435546875, + "completions/min_length": 827.0, + "completions/min_terminated_length": 827.0, + "epoch": 1.0358255451713396, + "grad_norm": 0.5581820607185364, + "kl": 0.0530538372695446, + "learning_rate": 1.59125e-06, + "loss": -0.0007, + "num_tokens": 86700389.0, + "reward": 1.5879935026168823, + "reward_std": 0.06466535478830338, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5879934430122375, + "rewards/correct_reward_func/std": 0.17678600549697876, + "step": 665 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2300.0, + "completions/mean_length": 1603.9881591796875, + "completions/mean_terminated_length": 1524.6143798828125, + "completions/min_length": 899.0, + "completions/min_terminated_length": 899.0, + "epoch": 1.0373831775700935, + "grad_norm": 0.5471597909927368, + "kl": 0.0508502759039402, + "learning_rate": 1.590625e-06, + "loss": 0.0707, + "num_tokens": 86841100.0, + "reward": 1.536754846572876, + "reward_std": 0.0733090415596962, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5367547273635864, + "rewards/correct_reward_func/std": 0.13262318074703217, + "step": 666 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2463.0, + "completions/max_terminated_length": 2463.0, + "completions/mean_length": 1527.5833740234375, + "completions/mean_terminated_length": 1527.5833740234375, + "completions/min_length": 982.0, + "completions/min_terminated_length": 982.0, + "epoch": 1.0389408099688473, + "grad_norm": 0.5596882104873657, + "kl": 0.05326198227703571, + "learning_rate": 1.59e-06, + "loss": -0.0095, + "num_tokens": 86975351.0, + "reward": 1.5027247667312622, + "reward_std": 0.06251788139343262, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5027247071266174, + "rewards/correct_reward_func/std": 0.15841983258724213, + "step": 667 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2329.0, + "completions/max_terminated_length": 2329.0, + "completions/mean_length": 1483.297607421875, + "completions/mean_terminated_length": 1483.297607421875, + "completions/min_length": 707.0, + "completions/min_terminated_length": 707.0, + "epoch": 1.0404984423676011, + "grad_norm": 0.5668331384658813, + "kl": 0.05328033119440079, + "learning_rate": 1.589375e-06, + "loss": 0.0127, + "num_tokens": 87105834.0, + "reward": 1.5575391054153442, + "reward_std": 0.09122274816036224, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5575389862060547, + "rewards/correct_reward_func/std": 0.16401030123233795, + "step": 668 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2943.0, + "completions/max_terminated_length": 2943.0, + "completions/mean_length": 1532.452392578125, + "completions/mean_terminated_length": 1532.452392578125, + "completions/min_length": 828.0, + "completions/min_terminated_length": 828.0, + "epoch": 1.0420560747663552, + "grad_norm": 0.5774728655815125, + "kl": 0.0538950152695179, + "learning_rate": 1.58875e-06, + "loss": -0.0052, + "num_tokens": 87240410.0, + "reward": 1.4293346405029297, + "reward_std": 0.045015521347522736, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4293345808982849, + "rewards/correct_reward_func/std": 0.10337743908166885, + "step": 669 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2548.0, + "completions/max_terminated_length": 2548.0, + "completions/mean_length": 1630.46435546875, + "completions/mean_terminated_length": 1630.46435546875, + "completions/min_length": 934.0, + "completions/min_terminated_length": 934.0, + "epoch": 1.043613707165109, + "grad_norm": 0.5443136692047119, + "kl": 0.05456646718084812, + "learning_rate": 1.588125e-06, + "loss": 0.0483, + "num_tokens": 87383351.0, + "reward": 1.4322932958602905, + "reward_std": 0.10355141013860703, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.45610272884368896, + "rewards/correct_reward_func/std": 0.14299173653125763, + "step": 670 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2304.0, + "completions/mean_length": 1638.0714111328125, + "completions/mean_terminated_length": 1559.1083984375, + "completions/min_length": 924.0, + "completions/min_terminated_length": 924.0, + "epoch": 1.0451713395638629, + "grad_norm": 0.5622561573982239, + "kl": 0.05328943021595478, + "learning_rate": 1.5874999999999998e-06, + "loss": 0.0366, + "num_tokens": 87527021.0, + "reward": 1.4649417400360107, + "reward_std": 0.10010730475187302, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4768464267253876, + "rewards/correct_reward_func/std": 0.13820233941078186, + "step": 671 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2270.0, + "completions/max_terminated_length": 2270.0, + "completions/mean_length": 1578.40478515625, + "completions/mean_terminated_length": 1578.40478515625, + "completions/min_length": 988.0, + "completions/min_terminated_length": 988.0, + "epoch": 1.0467289719626167, + "grad_norm": 0.5663480758666992, + "kl": 0.055856646969914436, + "learning_rate": 1.586875e-06, + "loss": -0.0032, + "num_tokens": 87665691.0, + "reward": 1.511879324913025, + "reward_std": 0.047249529510736465, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5118792653083801, + "rewards/correct_reward_func/std": 0.18857014179229736, + "step": 672 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2368.0, + "completions/max_terminated_length": 2368.0, + "completions/mean_length": 1583.3095703125, + "completions/mean_terminated_length": 1583.3095703125, + "completions/min_length": 1053.0, + "completions/min_terminated_length": 1053.0, + "epoch": 1.0482866043613708, + "grad_norm": 0.5549023747444153, + "kl": 0.05541318096220493, + "learning_rate": 1.5862499999999998e-06, + "loss": -0.0078, + "num_tokens": 87804701.0, + "reward": 1.4863669872283936, + "reward_std": 0.08935274183750153, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.4982716739177704, + "rewards/correct_reward_func/std": 0.11579611152410507, + "step": 673 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2302.0, + "completions/max_terminated_length": 2302.0, + "completions/mean_length": 1576.75, + "completions/mean_terminated_length": 1576.75, + "completions/min_length": 821.0, + "completions/min_terminated_length": 821.0, + "epoch": 1.0498442367601246, + "grad_norm": 0.5554832220077515, + "kl": 0.056341828778386116, + "learning_rate": 1.585625e-06, + "loss": 0.0043, + "num_tokens": 87943214.0, + "reward": 1.4435389041900635, + "reward_std": 0.06262954324483871, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4435388445854187, + "rewards/correct_reward_func/std": 0.1332552284002304, + "step": 674 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2357.0, + "completions/max_terminated_length": 2357.0, + "completions/mean_length": 1566.166748046875, + "completions/mean_terminated_length": 1566.166748046875, + "completions/min_length": 881.0, + "completions/min_terminated_length": 881.0, + "epoch": 1.0514018691588785, + "grad_norm": 0.5815324187278748, + "kl": 0.05804356001317501, + "learning_rate": 1.5849999999999999e-06, + "loss": 0.0252, + "num_tokens": 88080898.0, + "reward": 1.5119918584823608, + "reward_std": 0.07926620543003082, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5238964557647705, + "rewards/correct_reward_func/std": 0.1911584585905075, + "step": 675 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2573.0, + "completions/max_terminated_length": 2573.0, + "completions/mean_length": 1587.4405517578125, + "completions/mean_terminated_length": 1587.4405517578125, + "completions/min_length": 994.0, + "completions/min_terminated_length": 994.0, + "epoch": 1.0529595015576323, + "grad_norm": 0.5776386857032776, + "kl": 0.055761074647307396, + "learning_rate": 1.584375e-06, + "loss": 0.0302, + "num_tokens": 88220291.0, + "reward": 1.4661586284637451, + "reward_std": 0.05945530906319618, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.46615859866142273, + "rewards/correct_reward_func/std": 0.11711110919713974, + "step": 676 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3017.0, + "completions/max_terminated_length": 3017.0, + "completions/mean_length": 1575.71435546875, + "completions/mean_terminated_length": 1575.71435546875, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "epoch": 1.0545171339563864, + "grad_norm": 0.5745882987976074, + "kl": 0.053962018340826035, + "learning_rate": 1.5837499999999999e-06, + "loss": 0.0093, + "num_tokens": 88358657.0, + "reward": 1.5064351558685303, + "reward_std": 0.06469320505857468, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5064350962638855, + "rewards/correct_reward_func/std": 0.15094655752182007, + "step": 677 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2058.0, + "completions/max_terminated_length": 2058.0, + "completions/mean_length": 1459.84521484375, + "completions/mean_terminated_length": 1459.84521484375, + "completions/min_length": 728.0, + "completions/min_terminated_length": 728.0, + "epoch": 1.0560747663551402, + "grad_norm": 0.5632085800170898, + "kl": 0.0573277622461319, + "learning_rate": 1.5831249999999998e-06, + "loss": -0.0196, + "num_tokens": 88487278.0, + "reward": 1.5942164659500122, + "reward_std": 0.08631883561611176, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.6061212420463562, + "rewards/correct_reward_func/std": 0.1588619351387024, + "step": 678 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2402.0, + "completions/max_terminated_length": 2402.0, + "completions/mean_length": 1549.75, + "completions/mean_terminated_length": 1549.75, + "completions/min_length": 832.0, + "completions/min_terminated_length": 832.0, + "epoch": 1.057632398753894, + "grad_norm": 0.5981232523918152, + "kl": 0.05635823681950569, + "learning_rate": 1.5824999999999999e-06, + "loss": 0.0156, + "num_tokens": 88623529.0, + "reward": 1.4801684617996216, + "reward_std": 0.04865288734436035, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4801684021949768, + "rewards/correct_reward_func/std": 0.14745546877384186, + "step": 679 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2477.0, + "completions/mean_length": 1631.0714111328125, + "completions/mean_terminated_length": 1552.0240478515625, + "completions/min_length": 865.0, + "completions/min_terminated_length": 865.0, + "epoch": 1.0591900311526479, + "grad_norm": 0.5360672473907471, + "kl": 0.05281771533191204, + "learning_rate": 1.5818749999999998e-06, + "loss": 0.0423, + "num_tokens": 88766485.0, + "reward": 1.507429599761963, + "reward_std": 0.05878061428666115, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5074294805526733, + "rewards/correct_reward_func/std": 0.1723850518465042, + "step": 680 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2320.0, + "completions/max_terminated_length": 2320.0, + "completions/mean_length": 1525.7381591796875, + "completions/mean_terminated_length": 1525.7381591796875, + "completions/min_length": 847.0, + "completions/min_terminated_length": 847.0, + "epoch": 1.060747663551402, + "grad_norm": 0.5318810939788818, + "kl": 0.055077340453863144, + "learning_rate": 1.58125e-06, + "loss": -0.0029, + "num_tokens": 88900719.0, + "reward": 1.5089659690856934, + "reward_std": 0.060308195650577545, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5089658498764038, + "rewards/correct_reward_func/std": 0.17798349261283875, + "step": 681 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2428.0, + "completions/max_terminated_length": 2428.0, + "completions/mean_length": 1492.8214111328125, + "completions/mean_terminated_length": 1492.8214111328125, + "completions/min_length": 956.0, + "completions/min_terminated_length": 956.0, + "epoch": 1.0623052959501558, + "grad_norm": 0.6031827330589294, + "kl": 0.05352173000574112, + "learning_rate": 1.5806249999999998e-06, + "loss": -0.0015, + "num_tokens": 89031852.0, + "reward": 1.4641478061676025, + "reward_std": 0.06496328860521317, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4641478359699249, + "rewards/correct_reward_func/std": 0.1625915914773941, + "step": 682 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2383.0, + "completions/mean_length": 1627.09521484375, + "completions/mean_terminated_length": 1548.0, + "completions/min_length": 963.0, + "completions/min_terminated_length": 963.0, + "epoch": 1.0638629283489096, + "grad_norm": 0.5318005084991455, + "kl": 0.05356441251933575, + "learning_rate": 1.58e-06, + "loss": 0.0701, + "num_tokens": 89174522.0, + "reward": 1.5102105140686035, + "reward_std": 0.09376493096351624, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.5221152305603027, + "rewards/correct_reward_func/std": 0.16014137864112854, + "step": 683 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2362.0, + "completions/mean_length": 1641.2381591796875, + "completions/mean_terminated_length": 1562.313232421875, + "completions/min_length": 961.0, + "completions/min_terminated_length": 961.0, + "epoch": 1.0654205607476634, + "grad_norm": 0.5863988995552063, + "kl": 0.055096494033932686, + "learning_rate": 1.5793749999999998e-06, + "loss": 0.0739, + "num_tokens": 89318332.0, + "reward": 1.4779866933822632, + "reward_std": 0.0922832116484642, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.48989132046699524, + "rewards/correct_reward_func/std": 0.1688835322856903, + "step": 684 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2260.0, + "completions/max_terminated_length": 2260.0, + "completions/mean_length": 1586.9405517578125, + "completions/mean_terminated_length": 1586.9405517578125, + "completions/min_length": 1013.0, + "completions/min_terminated_length": 1013.0, + "epoch": 1.0669781931464175, + "grad_norm": 0.5507916212081909, + "kl": 0.05459226667881012, + "learning_rate": 1.5787500000000001e-06, + "loss": -0.0182, + "num_tokens": 89457557.0, + "reward": 1.543197512626648, + "reward_std": 0.06276614218950272, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5431973338127136, + "rewards/correct_reward_func/std": 0.18692582845687866, + "step": 685 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 4759.0, + "completions/max_terminated_length": 4759.0, + "completions/mean_length": 1548.416748046875, + "completions/mean_terminated_length": 1548.416748046875, + "completions/min_length": 848.0, + "completions/min_terminated_length": 848.0, + "epoch": 1.0685358255451713, + "grad_norm": 0.5588350296020508, + "kl": 0.05437791533768177, + "learning_rate": 1.578125e-06, + "loss": -0.0017, + "num_tokens": 89593504.0, + "reward": 1.4723842144012451, + "reward_std": 0.06399713456630707, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4723840355873108, + "rewards/correct_reward_func/std": 0.15390437841415405, + "step": 686 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3083.0, + "completions/max_terminated_length": 3083.0, + "completions/mean_length": 1471.7738037109375, + "completions/mean_terminated_length": 1471.7738037109375, + "completions/min_length": 437.0, + "completions/min_terminated_length": 437.0, + "epoch": 1.0700934579439252, + "grad_norm": 0.6055949926376343, + "kl": 0.05486376769840717, + "learning_rate": 1.5775e-06, + "loss": -0.0051, + "num_tokens": 89723055.0, + "reward": 1.460752010345459, + "reward_std": 0.09401778876781464, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.472656786441803, + "rewards/correct_reward_func/std": 0.18991245329380035, + "step": 687 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2914.0, + "completions/max_terminated_length": 2914.0, + "completions/mean_length": 1501.357177734375, + "completions/mean_terminated_length": 1501.357177734375, + "completions/min_length": 852.0, + "completions/min_terminated_length": 852.0, + "epoch": 1.071651090342679, + "grad_norm": 0.6138256788253784, + "kl": 0.0549286063760519, + "learning_rate": 1.576875e-06, + "loss": 0.0095, + "num_tokens": 89855031.0, + "reward": 1.452558994293213, + "reward_std": 0.062143485993146896, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.45255884528160095, + "rewards/correct_reward_func/std": 0.14738577604293823, + "step": 688 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2026.0, + "completions/max_terminated_length": 2026.0, + "completions/mean_length": 1508.857177734375, + "completions/mean_terminated_length": 1508.857177734375, + "completions/min_length": 958.0, + "completions/min_terminated_length": 958.0, + "epoch": 1.073208722741433, + "grad_norm": 0.5856159329414368, + "kl": 0.05561050772666931, + "learning_rate": 1.57625e-06, + "loss": 0.0011, + "num_tokens": 89987721.0, + "reward": 1.4332712888717651, + "reward_std": 0.04271453246474266, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4332713186740875, + "rewards/correct_reward_func/std": 0.11550971120595932, + "step": 689 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2183.0, + "completions/max_terminated_length": 2183.0, + "completions/mean_length": 1527.1785888671875, + "completions/mean_terminated_length": 1527.1785888671875, + "completions/min_length": 890.0, + "completions/min_terminated_length": 890.0, + "epoch": 1.074766355140187, + "grad_norm": 0.6025081276893616, + "kl": 0.05726106837391853, + "learning_rate": 1.575625e-06, + "loss": 0.0185, + "num_tokens": 90121794.0, + "reward": 1.4257436990737915, + "reward_std": 0.09166575968265533, + "rewards/contains_chinese/mean": 0.988095223903656, + "rewards/contains_chinese/std": 0.10910894721746445, + "rewards/correct_reward_func/mean": 0.43764835596084595, + "rewards/correct_reward_func/std": 0.14188778400421143, + "step": 690 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2313.0, + "completions/max_terminated_length": 2313.0, + "completions/mean_length": 1449.4761962890625, + "completions/mean_terminated_length": 1449.4761962890625, + "completions/min_length": 971.0, + "completions/min_terminated_length": 971.0, + "epoch": 1.0763239875389408, + "grad_norm": 0.6137524843215942, + "kl": 0.059694841504096985, + "learning_rate": 1.575e-06, + "loss": -0.0008, + "num_tokens": 90249376.0, + "reward": 1.5255221128463745, + "reward_std": 0.05913606286048889, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5255220532417297, + "rewards/correct_reward_func/std": 0.1678466647863388, + "step": 691 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2680.0, + "completions/max_terminated_length": 2680.0, + "completions/mean_length": 1516.2261962890625, + "completions/mean_terminated_length": 1516.2261962890625, + "completions/min_length": 878.0, + "completions/min_terminated_length": 878.0, + "epoch": 1.0778816199376946, + "grad_norm": 0.5627336502075195, + "kl": 0.052463850006461143, + "learning_rate": 1.574375e-06, + "loss": 0.0015, + "num_tokens": 90382571.0, + "reward": 1.518011450767517, + "reward_std": 0.06652691215276718, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5180113911628723, + "rewards/correct_reward_func/std": 0.13873633742332458, + "step": 692 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.011904761904761862, + "completions/max_length": 8192.0, + "completions/max_terminated_length": 2380.0, + "completions/mean_length": 1571.6429443359375, + "completions/mean_terminated_length": 1491.8795166015625, + "completions/min_length": 993.0, + "completions/min_terminated_length": 993.0, + "epoch": 1.0794392523364487, + "grad_norm": 0.5776975154876709, + "kl": 0.05454135872423649, + "learning_rate": 1.57375e-06, + "loss": 0.0496, + "num_tokens": 90520505.0, + "reward": 1.4717916250228882, + "reward_std": 0.09959909319877625, + "rewards/contains_chinese/mean": 0.976190447807312, + "rewards/contains_chinese/std": 0.15337100625038147, + "rewards/correct_reward_func/mean": 0.4956010580062866, + "rewards/correct_reward_func/std": 0.12597030401229858, + "step": 693 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1938.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 1448.7381591796875, + "completions/mean_terminated_length": 1448.7381591796875, + "completions/min_length": 989.0, + "completions/min_terminated_length": 989.0, + "epoch": 1.0809968847352025, + "grad_norm": 0.6719677448272705, + "kl": 0.054130397737026215, + "learning_rate": 1.573125e-06, + "loss": 0.0117, + "num_tokens": 90648091.0, + "reward": 1.4858334064483643, + "reward_std": 0.048804186284542084, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4858333170413971, + "rewards/correct_reward_func/std": 0.12014701217412949, + "step": 694 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2097.0, + "completions/max_terminated_length": 2097.0, + "completions/mean_length": 1482.857177734375, + "completions/mean_terminated_length": 1482.857177734375, + "completions/min_length": 882.0, + "completions/min_terminated_length": 882.0, + "epoch": 1.0825545171339563, + "grad_norm": 0.605505645275116, + "kl": 0.05565035529434681, + "learning_rate": 1.5725e-06, + "loss": 0.0116, + "num_tokens": 90778639.0, + "reward": 1.4878506660461426, + "reward_std": 0.05565100908279419, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4878506362438202, + "rewards/correct_reward_func/std": 0.15627160668373108, + "step": 695 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 6738.0, + "completions/max_terminated_length": 6738.0, + "completions/mean_length": 1650.5, + "completions/mean_terminated_length": 1650.5, + "completions/min_length": 1163.0, + "completions/min_terminated_length": 1163.0, + "epoch": 1.0841121495327102, + "grad_norm": 0.5941537618637085, + "kl": 0.051503732800483704, + "learning_rate": 1.5718749999999999e-06, + "loss": -0.0238, + "num_tokens": 90923377.0, + "reward": 1.477596640586853, + "reward_std": 0.056624628603458405, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.4775967299938202, + "rewards/correct_reward_func/std": 0.1360875815153122, + "step": 696 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2387.0, + "completions/max_terminated_length": 2387.0, + "completions/mean_length": 1535.0, + "completions/mean_terminated_length": 1535.0, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 1.0856697819314642, + "grad_norm": 0.5858432054519653, + "kl": 0.053479718044400215, + "learning_rate": 1.57125e-06, + "loss": 0.0207, + "num_tokens": 91058353.0, + "reward": 1.4727427959442139, + "reward_std": 0.040079839527606964, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.47274258732795715, + "rewards/correct_reward_func/std": 0.10987861454486847, + "step": 697 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2683.0, + "completions/max_terminated_length": 2683.0, + "completions/mean_length": 1514.5357666015625, + "completions/mean_terminated_length": 1514.5357666015625, + "completions/min_length": 1071.0, + "completions/min_terminated_length": 1071.0, + "epoch": 1.087227414330218, + "grad_norm": 0.5789026618003845, + "kl": 0.054706670343875885, + "learning_rate": 1.5706249999999999e-06, + "loss": 0.0078, + "num_tokens": 91191598.0, + "reward": 1.528199315071106, + "reward_std": 0.13074904680252075, + "rewards/contains_chinese/mean": 0.9642857313156128, + "rewards/contains_chinese/std": 0.18669146299362183, + "rewards/correct_reward_func/mean": 0.5639137029647827, + "rewards/correct_reward_func/std": 0.17987313866615295, + "step": 698 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2442.0, + "completions/max_terminated_length": 2442.0, + "completions/mean_length": 1555.3333740234375, + "completions/mean_terminated_length": 1555.3333740234375, + "completions/min_length": 1020.0, + "completions/min_terminated_length": 1020.0, + "epoch": 1.088785046728972, + "grad_norm": 0.551451563835144, + "kl": 0.05342511832714081, + "learning_rate": 1.57e-06, + "loss": -0.0102, + "num_tokens": 91328390.0, + "reward": 1.528260588645935, + "reward_std": 0.05848051980137825, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5282606482505798, + "rewards/correct_reward_func/std": 0.16042807698249817, + "step": 699 + }, + { + "clip_ratio": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2572.0, + "completions/max_terminated_length": 2572.0, + "completions/mean_length": 1527.8333740234375, + "completions/mean_terminated_length": 1527.8333740234375, + "completions/min_length": 1004.0, + "completions/min_terminated_length": 1004.0, + "epoch": 1.0903426791277258, + "grad_norm": 0.58552086353302, + "kl": 0.05442274548113346, + "learning_rate": 1.569375e-06, + "loss": 0.029, + "num_tokens": 91462740.0, + "reward": 1.5053088665008545, + "reward_std": 0.06954223662614822, + "rewards/contains_chinese/mean": 1.0, + "rewards/contains_chinese/std": 0.0, + "rewards/correct_reward_func/mean": 0.5053088068962097, + "rewards/correct_reward_func/std": 0.16963563859462738, + "step": 700 + } + ], + "logging_steps": 1.0, + "max_steps": 3210, + "num_input_tokens_seen": 91462740, + "num_train_epochs": 5, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}