{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.17363183539702004, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4441964285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3034.0, "completions/mean_length": 2116.3349609375, "completions/mean_terminated_length": 1352.5701904296875, "completions/min_length": 58.0, "completions/min_terminated_length": 58.0, "epoch": 0.00017363183539702005, "grad_norm": 0.07518604397773743, "kl": 0.0005568861961364746, "learning_rate": 0.0, "loss": 0.0094, "num_tokens": 1010302.0, "reward": 0.1674107164144516, "reward_std": 0.18299873173236847, "rewards/accuracy_reward/mean": 0.1674107164144516, "rewards/accuracy_reward/std": 0.37375950813293457, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 0.0003472636707940401, "grad_norm": 0.07518283277750015, "kl": 0.0005568861961364746, "learning_rate": 1e-08, "loss": 0.0094, "step": 2 }, { "clip_ratio/high_max": 0.002761595580523135, "clip_ratio/high_mean": 0.0006011565012613573, "clip_ratio/low_mean": 0.0005826481212807266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011838046184493578, "epoch": 0.0005208955061910602, "grad_norm": 0.07426605373620987, "kl": 0.0005519986152648926, "learning_rate": 2e-08, "loss": 0.0095, "step": 3 }, { "clip_ratio/high_max": 0.002850780623703031, "clip_ratio/high_mean": 0.0006112051714808331, "clip_ratio/low_mean": 0.0006351759872131879, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012463811335692299, "epoch": 0.0006945273415880802, "grad_norm": 0.07256410270929337, "kl": 0.000558316707611084, "learning_rate": 3e-08, "loss": 0.0095, "step": 4 }, { "clip_ratio/high_max": 0.0025070771444006823, "clip_ratio/high_mean": 0.000573295107415106, "clip_ratio/low_mean": 0.0005826076381936218, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011559027507246356, "epoch": 0.0008681591769851003, "grad_norm": 0.07427840679883957, "kl": 0.0005527734756469727, "learning_rate": 4e-08, "loss": 0.0095, "step": 5 }, { "clip_ratio/high_max": 0.002500095830328064, "clip_ratio/high_mean": 0.0005379317078677559, "clip_ratio/low_mean": 0.000593781854036024, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011317135631543351, "epoch": 0.0010417910123821203, "grad_norm": 0.0723797082901001, "kl": 0.000549614429473877, "learning_rate": 5e-08, "loss": 0.0095, "step": 6 }, { "clip_ratio/high_max": 0.002905295172240585, "clip_ratio/high_mean": 0.0006076123838738567, "clip_ratio/low_mean": 0.0006244990290724672, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012321114154474344, "epoch": 0.0012154228477791404, "grad_norm": 0.0806439220905304, "kl": 0.0005562901496887207, "learning_rate": 6e-08, "loss": 0.0095, "step": 7 }, { "clip_ratio/high_max": 0.003172771801473573, "clip_ratio/high_mean": 0.0006794932462526049, "clip_ratio/low_mean": 0.0006221751518751262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013016684001740941, "epoch": 0.0013890546831761604, "grad_norm": 0.07314153015613556, "kl": 0.0005549788475036621, "learning_rate": 7e-08, "loss": 0.0095, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5223214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 2336.9443359375, "completions/mean_terminated_length": 1533.1915283203125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.0015626865185731805, "grad_norm": 0.06706323474645615, "kl": 0.0005805492401123047, "learning_rate": 8e-08, "loss": 0.0264, "num_tokens": 2119453.0, "reward": 0.1227678656578064, "reward_std": 0.17434383928775787, "rewards/accuracy_reward/mean": 0.1227678582072258, "rewards/accuracy_reward/std": 0.3285374045372009, "step": 9 }, { "clip_ratio/high_max": 0.0026368980325059965, "clip_ratio/high_mean": 0.0005842189798386244, "clip_ratio/low_mean": 0.000646334178327379, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012305531649872137, "epoch": 0.0017363183539702005, "grad_norm": 0.06471949815750122, "kl": 0.0005822181701660156, "learning_rate": 9e-08, "loss": 0.0264, "step": 10 }, { "clip_ratio/high_max": 0.0026540827129792888, "clip_ratio/high_mean": 0.0005684592592842819, "clip_ratio/low_mean": 0.0005987331092001114, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011671923834910558, "epoch": 0.0019099501893672206, "grad_norm": 0.06459251791238785, "kl": 0.0005718469619750977, "learning_rate": 1e-07, "loss": 0.0264, "step": 11 }, { "clip_ratio/high_max": 0.0023983507580851438, "clip_ratio/high_mean": 0.000510746252075478, "clip_ratio/low_mean": 0.0006056880938558606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011164343591190118, "epoch": 0.0020835820247642406, "grad_norm": 0.06531789898872375, "kl": 0.0005796551704406738, "learning_rate": 1.0999999999999999e-07, "loss": 0.0264, "step": 12 }, { "clip_ratio/high_max": 0.002435275131574599, "clip_ratio/high_mean": 0.0005039215056967805, "clip_ratio/low_mean": 0.0006134185300652462, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011173400330335426, "epoch": 0.0022572138601612605, "grad_norm": 0.06517336517572403, "kl": 0.0005812644958496094, "learning_rate": 1.2e-07, "loss": 0.0264, "step": 13 }, { "clip_ratio/high_max": 0.002798363788315328, "clip_ratio/high_mean": 0.0005659126600221498, "clip_ratio/low_mean": 0.0006484390246441762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001214351666476432, "epoch": 0.0024308456955582807, "grad_norm": 0.06416178494691849, "kl": 0.000581204891204834, "learning_rate": 1.3e-07, "loss": 0.0265, "step": 14 }, { "clip_ratio/high_max": 0.002599140078018536, "clip_ratio/high_mean": 0.0005507599985321576, "clip_ratio/low_mean": 0.000657936292100203, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012086962783541821, "epoch": 0.0026044775309553006, "grad_norm": 0.06407883018255234, "kl": 0.0005803108215332031, "learning_rate": 1.4e-07, "loss": 0.0264, "step": 15 }, { "clip_ratio/high_max": 0.0026806539590324974, "clip_ratio/high_mean": 0.0005890884217478742, "clip_ratio/low_mean": 0.0006960580662962457, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012851465044150245, "epoch": 0.002778109366352321, "grad_norm": 0.06386357545852661, "kl": 0.0005791187286376953, "learning_rate": 1.5e-07, "loss": 0.0264, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4263392857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3053.0, "completions/mean_length": 2144.34619140625, "completions/mean_terminated_length": 1454.922119140625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.0029517412017493407, "grad_norm": 0.07487935572862625, "kl": 0.0005636215209960938, "learning_rate": 1.6e-07, "loss": 0.0139, "num_tokens": 3146352.0, "reward": 0.2053571492433548, "reward_std": 0.2151494175195694, "rewards/accuracy_reward/mean": 0.2053571492433548, "rewards/accuracy_reward/std": 0.40441396832466125, "step": 17 }, { "clip_ratio/high_max": 0.004256080117102101, "clip_ratio/high_mean": 0.0010428213320210489, "clip_ratio/low_mean": 0.0005661480897742877, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016089694317997782, "epoch": 0.003125373037146361, "grad_norm": 0.07354291528463364, "kl": 0.0005640387535095215, "learning_rate": 1.7000000000000001e-07, "loss": 0.014, "step": 18 }, { "clip_ratio/high_max": 0.003962268418945314, "clip_ratio/high_mean": 0.0010323274457277876, "clip_ratio/low_mean": 0.0005695096572253533, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001601837143425655, "epoch": 0.0032990048725433808, "grad_norm": 0.07306510210037231, "kl": 0.0005638599395751953, "learning_rate": 1.8e-07, "loss": 0.014, "step": 19 }, { "clip_ratio/high_max": 0.004173379335952632, "clip_ratio/high_mean": 0.0010520144671772869, "clip_ratio/low_mean": 0.000569460580663872, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016214750521612586, "epoch": 0.003472636707940401, "grad_norm": 0.07447966933250427, "kl": 0.0005657672882080078, "learning_rate": 1.8999999999999998e-07, "loss": 0.014, "step": 20 }, { "clip_ratio/high_max": 0.004181353541753197, "clip_ratio/high_mean": 0.0010200247618286085, "clip_ratio/low_mean": 0.0005712788120035839, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001591303580426029, "epoch": 0.003646268543337421, "grad_norm": 0.07602304220199585, "kl": 0.0005609989166259766, "learning_rate": 2e-07, "loss": 0.014, "step": 21 }, { "clip_ratio/high_max": 0.004171040527580772, "clip_ratio/high_mean": 0.0010476721749910212, "clip_ratio/low_mean": 0.0005688368451046699, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001616509035557101, "epoch": 0.003819900378734441, "grad_norm": 0.0725431740283966, "kl": 0.0005650520324707031, "learning_rate": 2.0999999999999997e-07, "loss": 0.014, "step": 22 }, { "clip_ratio/high_max": 0.004245861861818412, "clip_ratio/high_mean": 0.0010544555927936017, "clip_ratio/low_mean": 0.0005829912740864529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016374468814319698, "epoch": 0.003993532214131461, "grad_norm": 0.07230717688798904, "kl": 0.0005667209625244141, "learning_rate": 2.1999999999999998e-07, "loss": 0.014, "step": 23 }, { "clip_ratio/high_max": 0.004032027581615694, "clip_ratio/high_mean": 0.0010021264564556986, "clip_ratio/low_mean": 0.0005987712622754771, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016008977422643511, "epoch": 0.004167164049528481, "grad_norm": 0.07265053689479828, "kl": 0.0005652904510498047, "learning_rate": 2.3e-07, "loss": 0.014, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4553571428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 2129.13623046875, "completions/mean_terminated_length": 1340.840087890625, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.0043407958849255015, "grad_norm": 0.08780146390199661, "kl": 0.0005834698677062988, "learning_rate": 2.4e-07, "loss": 0.0122, "num_tokens": 4159549.0, "reward": 0.1941964328289032, "reward_std": 0.22552181780338287, "rewards/accuracy_reward/mean": 0.1941964328289032, "rewards/accuracy_reward/std": 0.3960230350494385, "step": 25 }, { "clip_ratio/high_max": 0.0039483520322392, "clip_ratio/high_mean": 0.000888818555040416, "clip_ratio/low_mean": 0.0008453624525373016, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017341810234938748, "epoch": 0.004514427720322521, "grad_norm": 0.08430933952331543, "kl": 0.0006339550018310547, "learning_rate": 2.5e-07, "loss": 0.0123, "step": 26 }, { "clip_ratio/high_max": 0.004146012477576733, "clip_ratio/high_mean": 0.0009395621118528652, "clip_ratio/low_mean": 0.0008747417773520283, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018143038723792415, "epoch": 0.004688059555719541, "grad_norm": 0.08639360964298248, "kl": 0.0006011128425598145, "learning_rate": 2.6e-07, "loss": 0.0123, "step": 27 }, { "clip_ratio/high_max": 0.003815489564658492, "clip_ratio/high_mean": 0.0008542888392639725, "clip_ratio/low_mean": 0.0008499759985625133, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017042648323695175, "epoch": 0.0048616913911165615, "grad_norm": 0.08524786680936813, "kl": 0.0005953311920166016, "learning_rate": 2.7e-07, "loss": 0.0123, "step": 28 }, { "clip_ratio/high_max": 0.0036790908943657996, "clip_ratio/high_mean": 0.0008685182651788637, "clip_ratio/low_mean": 0.0009222826415680174, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017908008921949659, "epoch": 0.005035323226513581, "grad_norm": 0.08499059826135635, "kl": 0.0005950927734375, "learning_rate": 2.8e-07, "loss": 0.0123, "step": 29 }, { "clip_ratio/high_max": 0.0037829569573659683, "clip_ratio/high_mean": 0.0008689734193012555, "clip_ratio/low_mean": 0.0008786804153260164, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017476538196206093, "epoch": 0.005208955061910601, "grad_norm": 0.08499704301357269, "kl": 0.0005869865417480469, "learning_rate": 2.9e-07, "loss": 0.0122, "step": 30 }, { "clip_ratio/high_max": 0.004097796128917253, "clip_ratio/high_mean": 0.0009012588461700943, "clip_ratio/low_mean": 0.0008530217692168662, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017542806008350453, "epoch": 0.005382586897307621, "grad_norm": 0.0847180038690567, "kl": 0.0005908012390136719, "learning_rate": 3e-07, "loss": 0.0123, "step": 31 }, { "clip_ratio/high_max": 0.0036784400726901367, "clip_ratio/high_mean": 0.0008380539102290641, "clip_ratio/low_mean": 0.0008431689620920224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016812228814160335, "epoch": 0.005556218732704642, "grad_norm": 0.0857548639178276, "kl": 0.0006113052368164062, "learning_rate": 3.1e-07, "loss": 0.0123, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 2151.72119140625, "completions/mean_terminated_length": 1486.2884521484375, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.005729850568101661, "grad_norm": 0.07399984449148178, "kl": 0.0005936026573181152, "learning_rate": 3.2e-07, "loss": 0.0111, "num_tokens": 5189624.0, "reward": 0.1763392984867096, "reward_std": 0.20950856804847717, "rewards/accuracy_reward/mean": 0.1763392835855484, "rewards/accuracy_reward/std": 0.3815346360206604, "step": 33 }, { "clip_ratio/high_max": 0.003883997267621453, "clip_ratio/high_mean": 0.0008249341167356761, "clip_ratio/low_mean": 0.0007022772615528083, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015272113905666629, "epoch": 0.005903482403498681, "grad_norm": 0.07179909199476242, "kl": 0.0005904436111450195, "learning_rate": 3.3e-07, "loss": 0.0112, "step": 34 }, { "clip_ratio/high_max": 0.0039041672425810248, "clip_ratio/high_mean": 0.0008337087383551989, "clip_ratio/low_mean": 0.0007004864901318797, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015341952275775839, "epoch": 0.006077114238895702, "grad_norm": 0.07258858531713486, "kl": 0.0005950927734375, "learning_rate": 3.4000000000000003e-07, "loss": 0.0112, "step": 35 }, { "clip_ratio/high_max": 0.003963205188483698, "clip_ratio/high_mean": 0.0008410395403188886, "clip_ratio/low_mean": 0.0006981797928347078, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015392193472507643, "epoch": 0.006250746074292722, "grad_norm": 0.07266177982091904, "kl": 0.0005968809127807617, "learning_rate": 3.5e-07, "loss": 0.0112, "step": 36 }, { "clip_ratio/high_max": 0.004058662649185862, "clip_ratio/high_mean": 0.0008566417291149264, "clip_ratio/low_mean": 0.0007481297507183626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016047714580054162, "epoch": 0.006424377909689741, "grad_norm": 0.07100122421979904, "kl": 0.0006065964698791504, "learning_rate": 3.6e-07, "loss": 0.0111, "step": 37 }, { "clip_ratio/high_max": 0.003824255156359868, "clip_ratio/high_mean": 0.0008116020612760622, "clip_ratio/low_mean": 0.0006907942204179562, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015023962769191712, "epoch": 0.0065980097450867616, "grad_norm": 0.07112527638673782, "kl": 0.0006021857261657715, "learning_rate": 3.7e-07, "loss": 0.0111, "step": 38 }, { "clip_ratio/high_max": 0.003947917517507449, "clip_ratio/high_mean": 0.0008612708024884341, "clip_ratio/low_mean": 0.0007247414524726992, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015860122657613829, "epoch": 0.006771641580483782, "grad_norm": 0.0719962865114212, "kl": 0.0006005167961120605, "learning_rate": 3.7999999999999996e-07, "loss": 0.0111, "step": 39 }, { "clip_ratio/high_max": 0.004301487875636667, "clip_ratio/high_mean": 0.0008830179140204564, "clip_ratio/low_mean": 0.0007119847286958247, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015950026518112281, "epoch": 0.006945273415880802, "grad_norm": 0.07074670493602753, "kl": 0.000606238842010498, "learning_rate": 3.8999999999999997e-07, "loss": 0.0111, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4196428571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 2098.27685546875, "completions/mean_terminated_length": 1394.199951171875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "epoch": 0.0071189052512778215, "grad_norm": 0.08379006385803223, "kl": 0.0006322860717773438, "learning_rate": 4e-07, "loss": 0.0223, "num_tokens": 6191444.0, "reward": 0.1785714328289032, "reward_std": 0.19681887328624725, "rewards/accuracy_reward/mean": 0.1785714328289032, "rewards/accuracy_reward/std": 0.3834211826324463, "step": 41 }, { "clip_ratio/high_max": 0.0032768902128736954, "clip_ratio/high_mean": 0.0007902619381638942, "clip_ratio/low_mean": 0.0007939784009067807, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015842403699934948, "epoch": 0.007292537086674842, "grad_norm": 0.07971406728029251, "kl": 0.0006505250930786133, "learning_rate": 4.0999999999999994e-07, "loss": 0.0224, "step": 42 }, { "clip_ratio/high_max": 0.0030777994688833132, "clip_ratio/high_mean": 0.0007435852414801047, "clip_ratio/low_mean": 0.000818575064840843, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015621603602085088, "epoch": 0.007466168922071862, "grad_norm": 0.08073156327009201, "kl": 0.000640869140625, "learning_rate": 4.1999999999999995e-07, "loss": 0.0224, "step": 43 }, { "clip_ratio/high_max": 0.003409467533856514, "clip_ratio/high_mean": 0.0007539876567079773, "clip_ratio/low_mean": 0.0007934697282507841, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015474573956453241, "epoch": 0.007639800757468882, "grad_norm": 0.08091849088668823, "kl": 0.0006548166275024414, "learning_rate": 4.2999999999999996e-07, "loss": 0.0223, "step": 44 }, { "clip_ratio/high_max": 0.0033135061530629173, "clip_ratio/high_mean": 0.0007845253235245764, "clip_ratio/low_mean": 0.0007589884821754822, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015435137856911751, "epoch": 0.007813432592865903, "grad_norm": 0.07926131784915924, "kl": 0.0006515979766845703, "learning_rate": 4.3999999999999997e-07, "loss": 0.0224, "step": 45 }, { "clip_ratio/high_max": 0.0031372296980407555, "clip_ratio/high_mean": 0.0007492986342185759, "clip_ratio/low_mean": 0.0007901795224825037, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015394781557915849, "epoch": 0.007987064428262922, "grad_norm": 0.0811559185385704, "kl": 0.0006567239761352539, "learning_rate": 4.5e-07, "loss": 0.0223, "step": 46 }, { "clip_ratio/high_max": 0.0029638049818458967, "clip_ratio/high_mean": 0.0006961375756873167, "clip_ratio/low_mean": 0.000848244901135331, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015443824686371954, "epoch": 0.008160696263659941, "grad_norm": 0.08105692267417908, "kl": 0.0006681680679321289, "learning_rate": 4.6e-07, "loss": 0.0223, "step": 47 }, { "clip_ratio/high_max": 0.0032189841967920074, "clip_ratio/high_mean": 0.0007669620920296438, "clip_ratio/low_mean": 0.000905851822153636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016728139235056005, "epoch": 0.008334328099056963, "grad_norm": 0.08049653470516205, "kl": 0.0006657838821411133, "learning_rate": 4.6999999999999995e-07, "loss": 0.0223, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4151785714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 2085.868408203125, "completions/mean_terminated_length": 1385.7900390625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.008507959934453982, "grad_norm": 0.08103679120540619, "kl": 0.0006326436996459961, "learning_rate": 4.8e-07, "loss": 0.0307, "num_tokens": 7189105.0, "reward": 0.2232142984867096, "reward_std": 0.20493680238723755, "rewards/accuracy_reward/mean": 0.2232142835855484, "rewards/accuracy_reward/std": 0.41686636209487915, "step": 49 }, { "clip_ratio/high_max": 0.0031352953919849824, "clip_ratio/high_mean": 0.0007364315333688864, "clip_ratio/low_mean": 0.0008790218768126579, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016154534050656366, "epoch": 0.008681591769851003, "grad_norm": 0.07928720861673355, "kl": 0.0006400346755981445, "learning_rate": 4.9e-07, "loss": 0.0308, "step": 50 }, { "clip_ratio/high_max": 0.002983226575452136, "clip_ratio/high_mean": 0.0007346476877501118, "clip_ratio/low_mean": 0.0008209199215798435, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015555676222902548, "epoch": 0.008855223605248022, "grad_norm": 0.07820896804332733, "kl": 0.0006337165832519531, "learning_rate": 5e-07, "loss": 0.0308, "step": 51 }, { "clip_ratio/high_max": 0.003392001093743602, "clip_ratio/high_mean": 0.0008124500823214476, "clip_ratio/low_mean": 0.0008540135531802662, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016664636768837227, "epoch": 0.009028855440645042, "grad_norm": 0.07736483216285706, "kl": 0.0006496906280517578, "learning_rate": 5.1e-07, "loss": 0.0307, "step": 52 }, { "clip_ratio/high_max": 0.0034579251041577663, "clip_ratio/high_mean": 0.000829722481284989, "clip_ratio/low_mean": 0.0008641391150376876, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016938615726758144, "epoch": 0.009202487276042063, "grad_norm": 0.07726527750492096, "kl": 0.0006527900695800781, "learning_rate": 5.2e-07, "loss": 0.0307, "step": 53 }, { "clip_ratio/high_max": 0.003214377589756623, "clip_ratio/high_mean": 0.000810741222494471, "clip_ratio/low_mean": 0.0008931753927754471, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017039166214090073, "epoch": 0.009376119111439082, "grad_norm": 0.08058212697505951, "kl": 0.0006560087203979492, "learning_rate": 5.3e-07, "loss": 0.0307, "step": 54 }, { "clip_ratio/high_max": 0.003127804094219755, "clip_ratio/high_mean": 0.0007541291345205536, "clip_ratio/low_mean": 0.0009171056817649514, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016712348242435837, "epoch": 0.009549750946836102, "grad_norm": 0.0762295350432396, "kl": 0.0006685256958007812, "learning_rate": 5.4e-07, "loss": 0.0307, "step": 55 }, { "clip_ratio/high_max": 0.003333622560603544, "clip_ratio/high_mean": 0.0008228500832956342, "clip_ratio/low_mean": 0.0009352699062219472, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017581199717824347, "epoch": 0.009723382782233123, "grad_norm": 0.07678646594285965, "kl": 0.0006722211837768555, "learning_rate": 5.5e-07, "loss": 0.0306, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 2213.544677734375, "completions/mean_terminated_length": 1469.550048828125, "completions/min_length": 348.0, "completions/min_terminated_length": 348.0, "epoch": 0.009897014617630142, "grad_norm": 0.07150210440158844, "kl": 0.0006679296493530273, "learning_rate": 5.6e-07, "loss": 0.0188, "num_tokens": 8246205.0, "reward": 0.212053582072258, "reward_std": 0.20658285915851593, "rewards/accuracy_reward/mean": 0.2120535671710968, "rewards/accuracy_reward/std": 0.40921956300735474, "step": 57 }, { "clip_ratio/high_max": 0.003994007878645789, "clip_ratio/high_mean": 0.0009539035982015776, "clip_ratio/low_mean": 0.0006073251420275483, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015612287561452831, "epoch": 0.010070646453027162, "grad_norm": 0.07255428284406662, "kl": 0.0006822347640991211, "learning_rate": 5.699999999999999e-07, "loss": 0.0189, "step": 58 }, { "clip_ratio/high_max": 0.003949335929064546, "clip_ratio/high_mean": 0.0009523572589387186, "clip_ratio/low_mean": 0.000618500235304964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015708575083408505, "epoch": 0.010244278288424183, "grad_norm": 0.07023589313030243, "kl": 0.0007022619247436523, "learning_rate": 5.8e-07, "loss": 0.0189, "step": 59 }, { "clip_ratio/high_max": 0.003948498204408679, "clip_ratio/high_mean": 0.000960502989073575, "clip_ratio/low_mean": 0.0006510827518013684, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016115857315526227, "epoch": 0.010417910123821202, "grad_norm": 0.0689178928732872, "kl": 0.0007317066192626953, "learning_rate": 5.9e-07, "loss": 0.0188, "step": 60 }, { "clip_ratio/high_max": 0.0039297049697779585, "clip_ratio/high_mean": 0.0009450304323763703, "clip_ratio/low_mean": 0.0006593866348794108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016044171006797114, "epoch": 0.010591541959218223, "grad_norm": 0.07289843261241913, "kl": 0.0007518529891967773, "learning_rate": 6e-07, "loss": 0.0188, "step": 61 }, { "clip_ratio/high_max": 0.003906666959665017, "clip_ratio/high_mean": 0.0009593330260031507, "clip_ratio/low_mean": 0.0006892785518175515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016486116091982694, "epoch": 0.010765173794615243, "grad_norm": 0.07011529058218002, "kl": 0.0007706880569458008, "learning_rate": 6.1e-07, "loss": 0.0188, "step": 62 }, { "clip_ratio/high_max": 0.004135772913286928, "clip_ratio/high_mean": 0.000979389057647495, "clip_ratio/low_mean": 0.0006986485373090545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016780376399765373, "epoch": 0.010938805630012262, "grad_norm": 0.06858497858047485, "kl": 0.0007666349411010742, "learning_rate": 6.2e-07, "loss": 0.0187, "step": 63 }, { "clip_ratio/high_max": 0.004077543224411784, "clip_ratio/high_mean": 0.0009741919639054686, "clip_ratio/low_mean": 0.0007616979112299305, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017358898521706578, "epoch": 0.011112437465409283, "grad_norm": 0.06751859188079834, "kl": 0.0008105039596557617, "learning_rate": 6.3e-07, "loss": 0.0187, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4308035714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 2157.6318359375, "completions/mean_terminated_length": 1465.5804443359375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.011286069300806303, "grad_norm": 0.07896920293569565, "kl": 0.000774383544921875, "learning_rate": 6.4e-07, "loss": 0.0289, "num_tokens": 9277688.0, "reward": 0.2566964328289032, "reward_std": 0.2110968977212906, "rewards/accuracy_reward/mean": 0.2566964328289032, "rewards/accuracy_reward/std": 0.4372987747192383, "step": 65 }, { "clip_ratio/high_max": 0.003698516735312296, "clip_ratio/high_mean": 0.0009404981828993186, "clip_ratio/low_mean": 0.0005447040127819491, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014852021995466202, "epoch": 0.011459701136203322, "grad_norm": 0.07904359698295593, "kl": 0.0007940530776977539, "learning_rate": 6.5e-07, "loss": 0.029, "step": 66 }, { "clip_ratio/high_max": 0.003601189795517712, "clip_ratio/high_mean": 0.000930328556933091, "clip_ratio/low_mean": 0.0005315762192594775, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014619047524320194, "epoch": 0.011633332971600343, "grad_norm": 0.07727278769016266, "kl": 0.0008080005645751953, "learning_rate": 6.6e-07, "loss": 0.0289, "step": 67 }, { "clip_ratio/high_max": 0.00394420118391281, "clip_ratio/high_mean": 0.00101125950095593, "clip_ratio/low_mean": 0.0005797040819288668, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015909635840216652, "epoch": 0.011806964806997363, "grad_norm": 0.07600650936365128, "kl": 0.0008560419082641602, "learning_rate": 6.7e-07, "loss": 0.0289, "step": 68 }, { "clip_ratio/high_max": 0.0037723180903412867, "clip_ratio/high_mean": 0.0010164570612687385, "clip_ratio/low_mean": 0.0005849840431437769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016014411266951356, "epoch": 0.011980596642394384, "grad_norm": 0.07477348297834396, "kl": 0.000873565673828125, "learning_rate": 6.800000000000001e-07, "loss": 0.0289, "step": 69 }, { "clip_ratio/high_max": 0.003572937654098496, "clip_ratio/high_mean": 0.0009383471142427879, "clip_ratio/low_mean": 0.000670964668643137, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001609311770152999, "epoch": 0.012154228477791403, "grad_norm": 0.07432721555233002, "kl": 0.0009011030197143555, "learning_rate": 6.9e-07, "loss": 0.0288, "step": 70 }, { "clip_ratio/high_max": 0.004295875496609369, "clip_ratio/high_mean": 0.0010443117639624688, "clip_ratio/low_mean": 0.0007529170088673709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001797228773284587, "epoch": 0.012327860313188423, "grad_norm": 0.07267308235168457, "kl": 0.0009461641311645508, "learning_rate": 7e-07, "loss": 0.0287, "step": 71 }, { "clip_ratio/high_max": 0.0045325598439376336, "clip_ratio/high_mean": 0.0010918093475993373, "clip_ratio/low_mean": 0.000795711078808381, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018875204368669074, "epoch": 0.012501492148585444, "grad_norm": 0.07293566316366196, "kl": 0.0009899139404296875, "learning_rate": 7.1e-07, "loss": 0.0287, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3214285714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3041.0, "completions/mean_length": 1900.1741943359375, "completions/mean_terminated_length": 1345.0987548828125, "completions/min_length": 283.0, "completions/min_terminated_length": 283.0, "epoch": 0.012675123983982463, "grad_norm": 0.09180476516485214, "kl": 0.0011156797409057617, "learning_rate": 7.2e-07, "loss": 0.0362, "num_tokens": 10187374.0, "reward": 0.3348214328289032, "reward_std": 0.26257798075675964, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "step": 73 }, { "clip_ratio/high_max": 0.004100159891095245, "clip_ratio/high_mean": 0.001193934856019041, "clip_ratio/low_mean": 0.0008296424266518443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020235772944943164, "epoch": 0.012848755819379483, "grad_norm": 0.08982545882463455, "kl": 0.001139998435974121, "learning_rate": 7.3e-07, "loss": 0.0363, "step": 74 }, { "clip_ratio/high_max": 0.00429122797868331, "clip_ratio/high_mean": 0.0012296319218876306, "clip_ratio/low_mean": 0.0008609084529780375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002090540361677995, "epoch": 0.013022387654776504, "grad_norm": 0.08892754465341568, "kl": 0.0012056827545166016, "learning_rate": 7.4e-07, "loss": 0.0362, "step": 75 }, { "clip_ratio/high_max": 0.004158383948379196, "clip_ratio/high_mean": 0.0012127063864681986, "clip_ratio/low_mean": 0.0008803073160379427, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002093013703415636, "epoch": 0.013196019490173523, "grad_norm": 0.08817041665315628, "kl": 0.0012491941452026367, "learning_rate": 7.5e-07, "loss": 0.0361, "step": 76 }, { "clip_ratio/high_max": 0.0048834725967026316, "clip_ratio/high_mean": 0.0013610067599074682, "clip_ratio/low_mean": 0.0009639359541324666, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023249427267728606, "epoch": 0.013369651325570543, "grad_norm": 0.08846459537744522, "kl": 0.0012907981872558594, "learning_rate": 7.599999999999999e-07, "loss": 0.0361, "step": 77 }, { "clip_ratio/high_max": 0.004636912650312297, "clip_ratio/high_mean": 0.001333299720499781, "clip_ratio/low_mean": 0.001034723654811387, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023680233553022845, "epoch": 0.013543283160967564, "grad_norm": 0.08633720129728317, "kl": 0.0013430118560791016, "learning_rate": 7.699999999999999e-07, "loss": 0.036, "step": 78 }, { "clip_ratio/high_max": 0.005123432740219869, "clip_ratio/high_mean": 0.0014745576791028725, "clip_ratio/low_mean": 0.0010868209719774313, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002561378607424558, "epoch": 0.013716914996364583, "grad_norm": 0.08502452075481415, "kl": 0.0014024972915649414, "learning_rate": 7.799999999999999e-07, "loss": 0.0359, "step": 79 }, { "clip_ratio/high_max": 0.005228551359323319, "clip_ratio/high_mean": 0.0014947765648685163, "clip_ratio/low_mean": 0.001276356058951933, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002771132636553375, "epoch": 0.013890546831761604, "grad_norm": 0.08289571851491928, "kl": 0.0014581680297851562, "learning_rate": 7.9e-07, "loss": 0.0357, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4151785714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 2062.716552734375, "completions/mean_terminated_length": 1346.2022705078125, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.014064178667158624, "grad_norm": 0.08741676807403564, "kl": 0.0013511180877685547, "learning_rate": 8e-07, "loss": 0.0281, "num_tokens": 11177223.0, "reward": 0.2611607313156128, "reward_std": 0.2473892718553543, "rewards/accuracy_reward/mean": 0.2611607015132904, "rewards/accuracy_reward/std": 0.43975839018821716, "step": 81 }, { "clip_ratio/high_max": 0.004445369013410527, "clip_ratio/high_mean": 0.0011010038446102044, "clip_ratio/low_mean": 0.0008391869937440788, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019401908557483694, "epoch": 0.014237810502555643, "grad_norm": 0.08471743762493134, "kl": 0.0014029741287231445, "learning_rate": 8.1e-07, "loss": 0.0282, "step": 82 }, { "clip_ratio/high_max": 0.00455673371834564, "clip_ratio/high_mean": 0.0011209725066692044, "clip_ratio/low_mean": 0.0008313582463870262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019523307273630053, "epoch": 0.014411442337952664, "grad_norm": 0.0828128457069397, "kl": 0.001486659049987793, "learning_rate": 8.199999999999999e-07, "loss": 0.0281, "step": 83 }, { "clip_ratio/high_max": 0.004635777028852317, "clip_ratio/high_mean": 0.0011230814977807313, "clip_ratio/low_mean": 0.0009613681111204642, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002084449602079985, "epoch": 0.014585074173349684, "grad_norm": 0.0822959691286087, "kl": 0.0015625953674316406, "learning_rate": 8.299999999999999e-07, "loss": 0.0281, "step": 84 }, { "clip_ratio/high_max": 0.004793585996594629, "clip_ratio/high_mean": 0.00116039141175861, "clip_ratio/low_mean": 0.0010650135006926575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022254048653849168, "epoch": 0.014758706008746703, "grad_norm": 0.07948633283376694, "kl": 0.001682281494140625, "learning_rate": 8.399999999999999e-07, "loss": 0.028, "step": 85 }, { "clip_ratio/high_max": 0.0052007981303177075, "clip_ratio/high_mean": 0.0013298452039407493, "clip_ratio/low_mean": 0.0012049128088165162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025347579849039903, "epoch": 0.014932337844143724, "grad_norm": 0.07849815487861633, "kl": 0.0017914772033691406, "learning_rate": 8.499999999999999e-07, "loss": 0.0279, "step": 86 }, { "clip_ratio/high_max": 0.005356111340006464, "clip_ratio/high_mean": 0.0013766098512633107, "clip_ratio/low_mean": 0.0012849277834448003, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002661537643689371, "epoch": 0.015105969679540743, "grad_norm": 0.0780460387468338, "kl": 0.0019347667694091797, "learning_rate": 8.599999999999999e-07, "loss": 0.0278, "step": 87 }, { "clip_ratio/high_max": 0.0058417806194484, "clip_ratio/high_mean": 0.001513496399638825, "clip_ratio/low_mean": 0.0014869967963022646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003000493152285344, "epoch": 0.015279601514937765, "grad_norm": 0.07758530974388123, "kl": 0.00205230712890625, "learning_rate": 8.699999999999999e-07, "loss": 0.0277, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3861607142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 2029.602783203125, "completions/mean_terminated_length": 1373.8399658203125, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.015453233350334784, "grad_norm": 0.07400370389223099, "kl": 0.0020072460174560547, "learning_rate": 8.799999999999999e-07, "loss": 0.0205, "num_tokens": 12149189.0, "reward": 0.3236607313156128, "reward_std": 0.2261972427368164, "rewards/accuracy_reward/mean": 0.3236607015132904, "rewards/accuracy_reward/std": 0.46839532256126404, "step": 89 }, { "clip_ratio/high_max": 0.004547907186861266, "clip_ratio/high_mean": 0.0010851063971131225, "clip_ratio/low_mean": 0.0005492405505265197, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016343469551429735, "epoch": 0.015626865185731805, "grad_norm": 0.07304179668426514, "kl": 0.0021042823791503906, "learning_rate": 8.9e-07, "loss": 0.0205, "step": 90 }, { "clip_ratio/high_max": 0.004526825499851839, "clip_ratio/high_mean": 0.0010922338652790131, "clip_ratio/low_mean": 0.0006552598006237531, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001747493689890689, "epoch": 0.015800497021128825, "grad_norm": 0.07167889177799225, "kl": 0.002244234085083008, "learning_rate": 9e-07, "loss": 0.0204, "step": 91 }, { "clip_ratio/high_max": 0.004971718476554088, "clip_ratio/high_mean": 0.0011797365087318212, "clip_ratio/low_mean": 0.0006583871016800913, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018381235477136215, "epoch": 0.015974128856525844, "grad_norm": 0.0698549821972847, "kl": 0.0023250579833984375, "learning_rate": 9.1e-07, "loss": 0.0204, "step": 92 }, { "clip_ratio/high_max": 0.004589101450619637, "clip_ratio/high_mean": 0.0012102787392223036, "clip_ratio/low_mean": 0.0007527421275881352, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001963020854873321, "epoch": 0.016147760691922863, "grad_norm": 0.06992534548044205, "kl": 0.002512216567993164, "learning_rate": 9.2e-07, "loss": 0.0203, "step": 93 }, { "clip_ratio/high_max": 0.004817654439648322, "clip_ratio/high_mean": 0.0012433063967023372, "clip_ratio/low_mean": 0.0008063747998221515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020496812021519872, "epoch": 0.016321392527319883, "grad_norm": 0.06944569200277328, "kl": 0.002605438232421875, "learning_rate": 9.3e-07, "loss": 0.0202, "step": 94 }, { "clip_ratio/high_max": 0.005232935374806402, "clip_ratio/high_mean": 0.001315542600423214, "clip_ratio/low_mean": 0.000917199590958262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022327422002490493, "epoch": 0.016495024362716906, "grad_norm": 0.06952877342700958, "kl": 0.0027480125427246094, "learning_rate": 9.399999999999999e-07, "loss": 0.0201, "step": 95 }, { "clip_ratio/high_max": 0.005548020842979895, "clip_ratio/high_mean": 0.001440592808648944, "clip_ratio/low_mean": 0.0010439954078265146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002484588212610106, "epoch": 0.016668656198113925, "grad_norm": 0.06754601746797562, "kl": 0.002933502197265625, "learning_rate": 9.499999999999999e-07, "loss": 0.02, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 1938.4912109375, "completions/mean_terminated_length": 1258.3857421875, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.016842288033510944, "grad_norm": 0.09821396321058273, "kl": 0.0030448436737060547, "learning_rate": 9.6e-07, "loss": 0.0219, "num_tokens": 13080409.0, "reward": 0.3683035969734192, "reward_std": 0.2832372784614563, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "step": 97 }, { "clip_ratio/high_max": 0.004665968797780806, "clip_ratio/high_mean": 0.0014500511115329573, "clip_ratio/low_mean": 0.0009530327547508932, "clip_ratio/low_min": 7.871536581660621e-06, "clip_ratio/region_mean": 0.0024030838558246614, "epoch": 0.017015919868907964, "grad_norm": 0.09680072963237762, "kl": 0.003153085708618164, "learning_rate": 9.7e-07, "loss": 0.022, "step": 98 }, { "clip_ratio/high_max": 0.004618203620339045, "clip_ratio/high_mean": 0.0014677258172923757, "clip_ratio/low_mean": 0.0010165532169139624, "clip_ratio/low_min": 1.2770739886036608e-05, "clip_ratio/region_mean": 0.002484279037162196, "epoch": 0.017189551704304983, "grad_norm": 0.09425373375415802, "kl": 0.0034074783325195312, "learning_rate": 9.8e-07, "loss": 0.0218, "step": 99 }, { "clip_ratio/high_max": 0.004920188781397883, "clip_ratio/high_mean": 0.0015765619823469024, "clip_ratio/low_mean": 0.00113775073555189, "clip_ratio/low_min": 3.1486146326642483e-05, "clip_ratio/region_mean": 0.0027143127044837456, "epoch": 0.017363183539702006, "grad_norm": 0.09248290210962296, "kl": 0.0035467147827148438, "learning_rate": 9.9e-07, "loss": 0.0217, "step": 100 }, { "clip_ratio/high_max": 0.005703470120351994, "clip_ratio/high_mean": 0.00166581543180655, "clip_ratio/low_mean": 0.0013430797039291065, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030088951589277713, "epoch": 0.017536815375099025, "grad_norm": 0.08908680826425552, "kl": 0.003830432891845703, "learning_rate": 1e-06, "loss": 0.0214, "step": 101 }, { "clip_ratio/high_max": 0.005863762889930513, "clip_ratio/high_mean": 0.0018024121991402353, "clip_ratio/low_mean": 0.0014265942429574352, "clip_ratio/low_min": 1.2770739886036608e-05, "clip_ratio/region_mean": 0.003229006450055749, "epoch": 0.017710447210496045, "grad_norm": 0.08827590942382812, "kl": 0.004046916961669922, "learning_rate": 1e-06, "loss": 0.0213, "step": 102 }, { "clip_ratio/high_max": 0.006518306319776457, "clip_ratio/high_mean": 0.0019782090557782794, "clip_ratio/low_mean": 0.0016705085811281606, "clip_ratio/low_min": 3.1486146326642483e-05, "clip_ratio/region_mean": 0.0036487176512309816, "epoch": 0.017884079045893064, "grad_norm": 0.08836202323436737, "kl": 0.00446319580078125, "learning_rate": 1e-06, "loss": 0.0211, "step": 103 }, { "clip_ratio/high_max": 0.0069326405209722, "clip_ratio/high_mean": 0.0020749341892951634, "clip_ratio/low_mean": 0.0018673858021429623, "clip_ratio/low_min": 2.3614609744981863e-05, "clip_ratio/region_mean": 0.003942320061469218, "epoch": 0.018057710881290084, "grad_norm": 0.08826611191034317, "kl": 0.00458526611328125, "learning_rate": 1e-06, "loss": 0.0209, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3995535714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 2007.1920166015625, "completions/mean_terminated_length": 1298.639404296875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.018231342716687103, "grad_norm": 0.08393793553113937, "kl": 0.0033822059631347656, "learning_rate": 1e-06, "loss": 0.0231, "num_tokens": 14041983.0, "reward": 0.3058035969734192, "reward_std": 0.23491349816322327, "rewards/accuracy_reward/mean": 0.3058035671710968, "rewards/accuracy_reward/std": 0.461262047290802, "step": 105 }, { "clip_ratio/high_max": 0.003997217983851442, "clip_ratio/high_mean": 0.0009552658002576209, "clip_ratio/low_mean": 0.0008604680806456599, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018157338881792384, "epoch": 0.018404974552084126, "grad_norm": 0.07852616161108017, "kl": 0.0034384727478027344, "learning_rate": 1e-06, "loss": 0.0231, "step": 106 }, { "clip_ratio/high_max": 0.004295800357795088, "clip_ratio/high_mean": 0.0010811391684910632, "clip_ratio/low_mean": 0.000844327561026148, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019254667358836741, "epoch": 0.018578606387481145, "grad_norm": 0.07736704498529434, "kl": 0.003505706787109375, "learning_rate": 1e-06, "loss": 0.0231, "step": 107 }, { "clip_ratio/high_max": 0.004560503737593535, "clip_ratio/high_mean": 0.0011590307299229607, "clip_ratio/low_mean": 0.0009559186596561631, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002114949373208219, "epoch": 0.018752238222878165, "grad_norm": 0.0765499398112297, "kl": 0.003620624542236328, "learning_rate": 1e-06, "loss": 0.023, "step": 108 }, { "clip_ratio/high_max": 0.005253656318018329, "clip_ratio/high_mean": 0.0012730001919862843, "clip_ratio/low_mean": 0.0011086120084655704, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023816121802155976, "epoch": 0.018925870058275184, "grad_norm": 0.07404652237892151, "kl": 0.0037598609924316406, "learning_rate": 1e-06, "loss": 0.0228, "step": 109 }, { "clip_ratio/high_max": 0.005704363389668288, "clip_ratio/high_mean": 0.0013903482999921835, "clip_ratio/low_mean": 0.001211059175147966, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026014074855993385, "epoch": 0.019099501893672204, "grad_norm": 0.07342441380023956, "kl": 0.003848552703857422, "learning_rate": 1e-06, "loss": 0.0227, "step": 110 }, { "clip_ratio/high_max": 0.00636932346606045, "clip_ratio/high_mean": 0.0015115973997126275, "clip_ratio/low_mean": 0.0013224545064076665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028340519029370626, "epoch": 0.019273133729069226, "grad_norm": 0.07353564351797104, "kl": 0.003985881805419922, "learning_rate": 1e-06, "loss": 0.0226, "step": 111 }, { "clip_ratio/high_max": 0.007524131955506164, "clip_ratio/high_mean": 0.0017368733458624774, "clip_ratio/low_mean": 0.001557835636958771, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003294709010333463, "epoch": 0.019446765564466246, "grad_norm": 0.06943716108798981, "kl": 0.004157543182373047, "learning_rate": 1e-06, "loss": 0.0224, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3348214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 1847.91748046875, "completions/mean_terminated_length": 1231.7684326171875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.019620397399863265, "grad_norm": 0.08264736086130142, "kl": 0.004137516021728516, "learning_rate": 1e-06, "loss": 0.0081, "num_tokens": 14938050.0, "reward": 0.3370535969734192, "reward_std": 0.21124520897865295, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.47323182225227356, "step": 113 }, { "clip_ratio/high_max": 0.0039032721124385716, "clip_ratio/high_mean": 0.0009903121056140662, "clip_ratio/low_mean": 0.0006034856987753301, "clip_ratio/low_min": 2.0083547497051768e-05, "clip_ratio/region_mean": 0.001593797801888286, "epoch": 0.019794029235260285, "grad_norm": 0.07990849763154984, "kl": 0.004215240478515625, "learning_rate": 1e-06, "loss": 0.0081, "step": 114 }, { "clip_ratio/high_max": 0.004641458384867292, "clip_ratio/high_mean": 0.0010962223636852286, "clip_ratio/low_mean": 0.0006119532440607145, "clip_ratio/low_min": 2.0083547497051768e-05, "clip_ratio/region_mean": 0.0017081756163861428, "epoch": 0.019967661070657304, "grad_norm": 0.07955808192491531, "kl": 0.0044002532958984375, "learning_rate": 1e-06, "loss": 0.008, "step": 115 }, { "clip_ratio/high_max": 0.004929477707264596, "clip_ratio/high_mean": 0.001220468486963, "clip_ratio/low_mean": 0.0007192735071157585, "clip_ratio/low_min": 3.735357313416898e-05, "clip_ratio/region_mean": 0.001939742002832645, "epoch": 0.020141292906054323, "grad_norm": 0.07885877043008804, "kl": 0.004464149475097656, "learning_rate": 1e-06, "loss": 0.0078, "step": 116 }, { "clip_ratio/high_max": 0.005431932877399959, "clip_ratio/high_mean": 0.0013107300405863498, "clip_ratio/low_mean": 0.0008663442935130661, "clip_ratio/low_min": 2.988285996252671e-05, "clip_ratio/region_mean": 0.0021770743005617987, "epoch": 0.020314924741451346, "grad_norm": 0.07767607271671295, "kl": 0.004650115966796875, "learning_rate": 1e-06, "loss": 0.0077, "step": 117 }, { "clip_ratio/high_max": 0.006048259116141708, "clip_ratio/high_mean": 0.0014546032282396482, "clip_ratio/low_mean": 0.0009584152332990925, "clip_ratio/low_min": 4.482428994379006e-05, "clip_ratio/region_mean": 0.0024130184574460145, "epoch": 0.020488556576848366, "grad_norm": 0.07633480429649353, "kl": 0.004792690277099609, "learning_rate": 1e-06, "loss": 0.0075, "step": 118 }, { "clip_ratio/high_max": 0.006363012729707407, "clip_ratio/high_mean": 0.0015345831791364617, "clip_ratio/low_mean": 0.0011184974814568704, "clip_ratio/low_min": 1.0041773748525884e-05, "clip_ratio/region_mean": 0.002653080696290999, "epoch": 0.020662188412245385, "grad_norm": 0.07476557046175003, "kl": 0.004907131195068359, "learning_rate": 1e-06, "loss": 0.0073, "step": 119 }, { "clip_ratio/high_max": 0.007921144890133291, "clip_ratio/high_mean": 0.0018484887791601068, "clip_ratio/low_mean": 0.001251010267651509, "clip_ratio/low_min": 2.0083547497051768e-05, "clip_ratio/region_mean": 0.0030994990383987897, "epoch": 0.020835820247642405, "grad_norm": 0.07136659324169159, "kl": 0.005038261413574219, "learning_rate": 1e-06, "loss": 0.0071, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3794642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 1930.0960693359375, "completions/mean_terminated_length": 1231.809326171875, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.021009452083039424, "grad_norm": 0.08095911890268326, "kl": 0.005431175231933594, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 15865869.0, "reward": 0.408482164144516, "reward_std": 0.22500260174274445, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "step": 121 }, { "clip_ratio/high_max": 0.004851713018069859, "clip_ratio/high_mean": 0.001360524291612819, "clip_ratio/low_mean": 0.0006360148054227466, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019965391074947547, "epoch": 0.021183083918436447, "grad_norm": 0.07773750275373459, "kl": 0.005425453186035156, "learning_rate": 1e-06, "loss": 0.0248, "step": 122 }, { "clip_ratio/high_max": 0.00540244370495202, "clip_ratio/high_mean": 0.0014174081027249485, "clip_ratio/low_mean": 0.0006910337231147423, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021084418449390796, "epoch": 0.021356715753833466, "grad_norm": 0.07707370072603226, "kl": 0.005459308624267578, "learning_rate": 1e-06, "loss": 0.0248, "step": 123 }, { "clip_ratio/high_max": 0.005811562477902044, "clip_ratio/high_mean": 0.0015023706550891802, "clip_ratio/low_mean": 0.0007706979731665342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022730686105205677, "epoch": 0.021530347589230486, "grad_norm": 0.0752694383263588, "kl": 0.005592823028564453, "learning_rate": 1e-06, "loss": 0.0246, "step": 124 }, { "clip_ratio/high_max": 0.005960488575510681, "clip_ratio/high_mean": 0.0015578029219796008, "clip_ratio/low_mean": 0.0007921496907101755, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002349952632357599, "epoch": 0.021703979424627505, "grad_norm": 0.07837209850549698, "kl": 0.005625724792480469, "learning_rate": 1e-06, "loss": 0.0245, "step": 125 }, { "clip_ratio/high_max": 0.006398163191988715, "clip_ratio/high_mean": 0.0016745692473705276, "clip_ratio/low_mean": 0.0009608030268282164, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002635372289660154, "epoch": 0.021877611260024524, "grad_norm": 0.07582316547632217, "kl": 0.005856990814208984, "learning_rate": 1e-06, "loss": 0.0244, "step": 126 }, { "clip_ratio/high_max": 0.006704689539219544, "clip_ratio/high_mean": 0.0018308112598788284, "clip_ratio/low_mean": 0.0010544128599576652, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028852240948253893, "epoch": 0.022051243095421544, "grad_norm": 0.06979215890169144, "kl": 0.006018161773681641, "learning_rate": 1e-06, "loss": 0.0242, "step": 127 }, { "clip_ratio/high_max": 0.007274195504578529, "clip_ratio/high_mean": 0.0019609469261467893, "clip_ratio/low_mean": 0.0012697803585979273, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032307273177138995, "epoch": 0.022224874930818567, "grad_norm": 0.0688919946551323, "kl": 0.006125450134277344, "learning_rate": 1e-06, "loss": 0.024, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4084821428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 2035.0648193359375, "completions/mean_terminated_length": 1318.992431640625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.022398506766215586, "grad_norm": 0.08655736595392227, "kl": 0.005330562591552734, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 16846018.0, "reward": 0.3035714328289032, "reward_std": 0.254311740398407, "rewards/accuracy_reward/mean": 0.3035714328289032, "rewards/accuracy_reward/std": 0.46031373739242554, "step": 129 }, { "clip_ratio/high_max": 0.0048619600165693555, "clip_ratio/high_mean": 0.001311855442054366, "clip_ratio/low_mean": 0.000777202934386878, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020890583746222546, "epoch": 0.022572138601612605, "grad_norm": 0.08760856091976166, "kl": 0.005454063415527344, "learning_rate": 1e-06, "loss": 0.0205, "step": 130 }, { "clip_ratio/high_max": 0.00501359148620395, "clip_ratio/high_mean": 0.001377075447635434, "clip_ratio/low_mean": 0.0009486862600169843, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023257617322087754, "epoch": 0.022745770437009625, "grad_norm": 0.0840456485748291, "kl": 0.005469799041748047, "learning_rate": 1e-06, "loss": 0.0204, "step": 131 }, { "clip_ratio/high_max": 0.005394582782173529, "clip_ratio/high_mean": 0.0014868567363919283, "clip_ratio/low_mean": 0.001004408745757246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024912654916988686, "epoch": 0.022919402272406644, "grad_norm": 0.08194149285554886, "kl": 0.005599021911621094, "learning_rate": 1e-06, "loss": 0.0202, "step": 132 }, { "clip_ratio/high_max": 0.005871234818187077, "clip_ratio/high_mean": 0.0016160059089997958, "clip_ratio/low_mean": 0.0011159082923768437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027319141663610935, "epoch": 0.023093034107803667, "grad_norm": 0.0789162665605545, "kl": 0.005701541900634766, "learning_rate": 1e-06, "loss": 0.02, "step": 133 }, { "clip_ratio/high_max": 0.006407690663763788, "clip_ratio/high_mean": 0.0017475099020884954, "clip_ratio/low_mean": 0.0013060904466328793, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030536003469023854, "epoch": 0.023266665943200687, "grad_norm": 0.07720374315977097, "kl": 0.005742549896240234, "learning_rate": 1e-06, "loss": 0.0198, "step": 134 }, { "clip_ratio/high_max": 0.0070254678175842855, "clip_ratio/high_mean": 0.001874488430985366, "clip_ratio/low_mean": 0.001571705483002006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034461938666936476, "epoch": 0.023440297778597706, "grad_norm": 0.07577391713857651, "kl": 0.005927562713623047, "learning_rate": 1e-06, "loss": 0.0197, "step": 135 }, { "clip_ratio/high_max": 0.007858596782170935, "clip_ratio/high_mean": 0.0020633719018405827, "clip_ratio/low_mean": 0.001914332018714049, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003977703945565736, "epoch": 0.023613929613994725, "grad_norm": 0.07286782562732697, "kl": 0.0061016082763671875, "learning_rate": 1e-06, "loss": 0.0193, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3072.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 1933.587158203125, "completions/mean_terminated_length": 1250.539306640625, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.023787561449391745, "grad_norm": 0.07663754373788834, "kl": 0.00577545166015625, "learning_rate": 1e-06, "loss": 0.0313, "num_tokens": 17769721.0, "reward": 0.4218750298023224, "reward_std": 0.23372167348861694, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "step": 137 }, { "clip_ratio/high_max": 0.004371647075458895, "clip_ratio/high_mean": 0.0011583919213080662, "clip_ratio/low_mean": 0.000503549390941771, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016619413072476164, "epoch": 0.023961193284788768, "grad_norm": 0.07479061186313629, "kl": 0.005824089050292969, "learning_rate": 1e-06, "loss": 0.0314, "step": 138 }, { "clip_ratio/high_max": 0.004285951017664047, "clip_ratio/high_mean": 0.001203526346671424, "clip_ratio/low_mean": 0.0005746116535192414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001778137986548245, "epoch": 0.024134825120185787, "grad_norm": 0.07414408028125763, "kl": 0.0059680938720703125, "learning_rate": 1e-06, "loss": 0.0313, "step": 139 }, { "clip_ratio/high_max": 0.004300973392673768, "clip_ratio/high_mean": 0.0012260285429874784, "clip_ratio/low_mean": 0.0005980913188068371, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001824119874072494, "epoch": 0.024308456955582806, "grad_norm": 0.07098712772130966, "kl": 0.006072998046875, "learning_rate": 1e-06, "loss": 0.0312, "step": 140 }, { "clip_ratio/high_max": 0.0050545641424832866, "clip_ratio/high_mean": 0.0013654219947056845, "clip_ratio/low_mean": 0.0007108074360075989, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002076229456179135, "epoch": 0.024482088790979826, "grad_norm": 0.06988701969385147, "kl": 0.006176948547363281, "learning_rate": 1e-06, "loss": 0.031, "step": 141 }, { "clip_ratio/high_max": 0.0053927825174469035, "clip_ratio/high_mean": 0.001447280352749658, "clip_ratio/low_mean": 0.0008586188146182394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002305899150087498, "epoch": 0.024655720626376845, "grad_norm": 0.0681399255990982, "kl": 0.006438255310058594, "learning_rate": 1e-06, "loss": 0.0308, "step": 142 }, { "clip_ratio/high_max": 0.00592138001411513, "clip_ratio/high_mean": 0.00160317979998581, "clip_ratio/low_mean": 0.0010102350497618318, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002613414872030262, "epoch": 0.024829352461773865, "grad_norm": 0.06519921123981476, "kl": 0.006699562072753906, "learning_rate": 1e-06, "loss": 0.0307, "step": 143 }, { "clip_ratio/high_max": 0.0066412329688319005, "clip_ratio/high_mean": 0.0017882650722640392, "clip_ratio/low_mean": 0.0011500060691105318, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002938271154562244, "epoch": 0.025002984297170888, "grad_norm": 0.06443555653095245, "kl": 0.0068721771240234375, "learning_rate": 1e-06, "loss": 0.0305, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3727678571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 1979.2679443359375, "completions/mean_terminated_length": 1329.8504638671875, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.025176616132567907, "grad_norm": 0.06663346290588379, "kl": 0.006520271301269531, "learning_rate": 1e-06, "loss": 0.0205, "num_tokens": 18723113.0, "reward": 0.28125, "reward_std": 0.16375385224819183, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45011183619499207, "step": 145 }, { "clip_ratio/high_max": 0.0030207013005565386, "clip_ratio/high_mean": 0.0007204095982160652, "clip_ratio/low_mean": 0.0005041334976567668, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012245431003066187, "epoch": 0.025350247967964926, "grad_norm": 0.07924777269363403, "kl": 0.0066318511962890625, "learning_rate": 1e-06, "loss": 0.0206, "step": 146 }, { "clip_ratio/high_max": 0.003344574844959425, "clip_ratio/high_mean": 0.0007526034414695459, "clip_ratio/low_mean": 0.0006024809365499095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0013550843691518821, "epoch": 0.025523879803361946, "grad_norm": 0.06370459496974945, "kl": 0.0067901611328125, "learning_rate": 1e-06, "loss": 0.0205, "step": 147 }, { "clip_ratio/high_max": 0.0034553498517198022, "clip_ratio/high_mean": 0.0007513530463256757, "clip_ratio/low_mean": 0.0006781640753388274, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014295171222329373, "epoch": 0.025697511638758965, "grad_norm": 0.0663418099284172, "kl": 0.006916046142578125, "learning_rate": 1e-06, "loss": 0.0204, "step": 148 }, { "clip_ratio/high_max": 0.003796188194428396, "clip_ratio/high_mean": 0.0008537604383036523, "clip_ratio/low_mean": 0.000808169459560304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001661929883994162, "epoch": 0.025871143474155988, "grad_norm": 0.07972723990678787, "kl": 0.007083892822265625, "learning_rate": 1e-06, "loss": 0.0203, "step": 149 }, { "clip_ratio/high_max": 0.004629340441169916, "clip_ratio/high_mean": 0.000985632138508663, "clip_ratio/low_mean": 0.0009128691410751344, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018985012493430986, "epoch": 0.026044775309553007, "grad_norm": 0.06047123670578003, "kl": 0.007245063781738281, "learning_rate": 1e-06, "loss": 0.0202, "step": 150 }, { "clip_ratio/high_max": 0.004806700231711147, "clip_ratio/high_mean": 0.0010732566029219015, "clip_ratio/low_mean": 0.001058938100186424, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021321946969692362, "epoch": 0.026218407144950027, "grad_norm": 0.05877959355711937, "kl": 0.007388114929199219, "learning_rate": 1e-06, "loss": 0.02, "step": 151 }, { "clip_ratio/high_max": 0.005967060122202383, "clip_ratio/high_mean": 0.0012560106470118626, "clip_ratio/low_mean": 0.0012162447937953402, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002472255446264171, "epoch": 0.026392038980347046, "grad_norm": 0.058075983077287674, "kl": 0.007506370544433594, "learning_rate": 1e-06, "loss": 0.0199, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 2017.2813720703125, "completions/mean_terminated_length": 1308.8880615234375, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.026565670815744066, "grad_norm": 0.06706869602203369, "kl": 0.0069751739501953125, "learning_rate": 1e-06, "loss": 0.0163, "num_tokens": 19690767.0, "reward": 0.3415178656578064, "reward_std": 0.18599644303321838, "rewards/accuracy_reward/mean": 0.3415178656578064, "rewards/accuracy_reward/std": 0.4747488796710968, "step": 153 }, { "clip_ratio/high_max": 0.004434643993590726, "clip_ratio/high_mean": 0.0009370632978971116, "clip_ratio/low_mean": 0.0006492042020909139, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015862674881645944, "epoch": 0.026739302651141085, "grad_norm": 0.06352043896913528, "kl": 0.0070743560791015625, "learning_rate": 1e-06, "loss": 0.0163, "step": 154 }, { "clip_ratio/high_max": 0.004057594829646405, "clip_ratio/high_mean": 0.0008946688044488837, "clip_ratio/low_mean": 0.0006769216606699047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015715904628450517, "epoch": 0.026912934486538108, "grad_norm": 0.0632409006357193, "kl": 0.007099151611328125, "learning_rate": 1e-06, "loss": 0.0162, "step": 155 }, { "clip_ratio/high_max": 0.0050138172300648876, "clip_ratio/high_mean": 0.001070659096512827, "clip_ratio/low_mean": 0.0007576853683985973, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001828344471505261, "epoch": 0.027086566321935127, "grad_norm": 0.06357962638139725, "kl": 0.00717926025390625, "learning_rate": 1e-06, "loss": 0.0161, "step": 156 }, { "clip_ratio/high_max": 0.00528864227453596, "clip_ratio/high_mean": 0.0011208452256141754, "clip_ratio/low_mean": 0.0007996724248187093, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019205176413379377, "epoch": 0.027260198157332147, "grad_norm": 0.06167202070355415, "kl": 0.007285118103027344, "learning_rate": 1e-06, "loss": 0.016, "step": 157 }, { "clip_ratio/high_max": 0.006105554861278506, "clip_ratio/high_mean": 0.0012601537894170178, "clip_ratio/low_mean": 0.0009107661221605667, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021709198990720324, "epoch": 0.027433829992729166, "grad_norm": 0.06077278032898903, "kl": 0.00736236572265625, "learning_rate": 1e-06, "loss": 0.0159, "step": 158 }, { "clip_ratio/high_max": 0.006380973354680464, "clip_ratio/high_mean": 0.0012938425334141357, "clip_ratio/low_mean": 0.001056705562859861, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002350548056710977, "epoch": 0.027607461828126185, "grad_norm": 0.05914130061864853, "kl": 0.007594108581542969, "learning_rate": 1e-06, "loss": 0.0157, "step": 159 }, { "clip_ratio/high_max": 0.007571938382170629, "clip_ratio/high_mean": 0.0015345798628914054, "clip_ratio/low_mean": 0.0012650793487409828, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027996592471026815, "epoch": 0.02778109366352321, "grad_norm": 0.056562379002571106, "kl": 0.00772857666015625, "learning_rate": 1e-06, "loss": 0.0155, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3839285714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 2057.390625, "completions/mean_terminated_length": 1425.097900390625, "completions/min_length": 315.0, "completions/min_terminated_length": 315.0, "epoch": 0.027954725498920228, "grad_norm": 0.07040048390626907, "kl": 0.0069732666015625, "learning_rate": 1e-06, "loss": 0.014, "num_tokens": 20680598.0, "reward": 0.3437500298023224, "reward_std": 0.19478052854537964, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "step": 161 }, { "clip_ratio/high_max": 0.0037697603529522894, "clip_ratio/high_mean": 0.000979594557747987, "clip_ratio/low_mean": 0.0005216064530486619, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001501201031715027, "epoch": 0.028128357334317247, "grad_norm": 0.0673152357339859, "kl": 0.007232666015625, "learning_rate": 1e-06, "loss": 0.014, "step": 162 }, { "clip_ratio/high_max": 0.004402585413117777, "clip_ratio/high_mean": 0.0010781300484268286, "clip_ratio/low_mean": 0.0005549893892293767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016331194347003475, "epoch": 0.028301989169714267, "grad_norm": 0.06635653972625732, "kl": 0.007292747497558594, "learning_rate": 1e-06, "loss": 0.014, "step": 163 }, { "clip_ratio/high_max": 0.004397464061185019, "clip_ratio/high_mean": 0.0011495708080246914, "clip_ratio/low_mean": 0.0006219515505563322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017715223539198632, "epoch": 0.028475621005111286, "grad_norm": 0.06589936465024948, "kl": 0.00749969482421875, "learning_rate": 1e-06, "loss": 0.0138, "step": 164 }, { "clip_ratio/high_max": 0.004474310795558267, "clip_ratio/high_mean": 0.0011851532217406202, "clip_ratio/low_mean": 0.0008197030983865261, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002004856330131588, "epoch": 0.02864925284050831, "grad_norm": 0.06395601481199265, "kl": 0.007836341857910156, "learning_rate": 1e-06, "loss": 0.0136, "step": 165 }, { "clip_ratio/high_max": 0.005478622988448478, "clip_ratio/high_mean": 0.001394131928918796, "clip_ratio/low_mean": 0.0009513227573734184, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023454546926586772, "epoch": 0.028822884675905328, "grad_norm": 0.06276561319828033, "kl": 0.007869720458984375, "learning_rate": 1e-06, "loss": 0.0135, "step": 166 }, { "clip_ratio/high_max": 0.0060274213919910835, "clip_ratio/high_mean": 0.0014766716274152714, "clip_ratio/low_mean": 0.0010191873125222628, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024958589528978337, "epoch": 0.028996516511302348, "grad_norm": 0.06163609400391579, "kl": 0.008042335510253906, "learning_rate": 1e-06, "loss": 0.0133, "step": 167 }, { "clip_ratio/high_max": 0.007194999201601604, "clip_ratio/high_mean": 0.0017534274456920684, "clip_ratio/low_mean": 0.0012308122977628955, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029842397289030487, "epoch": 0.029170148346699367, "grad_norm": 0.05956633388996124, "kl": 0.008493423461914062, "learning_rate": 1e-06, "loss": 0.0132, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3459821428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 1939.35498046875, "completions/mean_terminated_length": 1340.174072265625, "completions/min_length": 40.0, "completions/min_terminated_length": 40.0, "epoch": 0.029343780182096386, "grad_norm": 0.07166067510843277, "kl": 0.011423110961914062, "learning_rate": 1e-06, "loss": 0.0255, "num_tokens": 21606565.0, "reward": 0.3906250298023224, "reward_std": 0.21243984997272491, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "step": 169 }, { "clip_ratio/high_max": 0.004154438352998113, "clip_ratio/high_mean": 0.0009966524285118794, "clip_ratio/low_mean": 0.0005928743455569929, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015895267624728149, "epoch": 0.029517412017493406, "grad_norm": 0.07069960981607437, "kl": 0.010828971862792969, "learning_rate": 1e-06, "loss": 0.0256, "step": 170 }, { "clip_ratio/high_max": 0.003984667302574962, "clip_ratio/high_mean": 0.0009817298105190275, "clip_ratio/low_mean": 0.0007308181493499433, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001712547989882296, "epoch": 0.02969104385289043, "grad_norm": 0.06895480304956436, "kl": 0.011860847473144531, "learning_rate": 1e-06, "loss": 0.0255, "step": 171 }, { "clip_ratio/high_max": 0.004675776628573658, "clip_ratio/high_mean": 0.0011062003482038563, "clip_ratio/low_mean": 0.0007853060903926234, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018915064129032544, "epoch": 0.029864675688287448, "grad_norm": 0.06729733198881149, "kl": 0.012163162231445312, "learning_rate": 1e-06, "loss": 0.0254, "step": 172 }, { "clip_ratio/high_max": 0.0052451919837039895, "clip_ratio/high_mean": 0.0012543406110125943, "clip_ratio/low_mean": 0.0008990823475869547, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021534229217650136, "epoch": 0.030038307523684468, "grad_norm": 0.06534060090780258, "kl": 0.012143135070800781, "learning_rate": 1e-06, "loss": 0.0252, "step": 173 }, { "clip_ratio/high_max": 0.005410314337495947, "clip_ratio/high_mean": 0.0013065263460703136, "clip_ratio/low_mean": 0.0010690507097024238, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002375577109887672, "epoch": 0.030211939359081487, "grad_norm": 0.0643325001001358, "kl": 0.012264251708984375, "learning_rate": 1e-06, "loss": 0.0251, "step": 174 }, { "clip_ratio/high_max": 0.006287320808041841, "clip_ratio/high_mean": 0.0014851219229967683, "clip_ratio/low_mean": 0.0013329398552741623, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002818061810103245, "epoch": 0.030385571194478506, "grad_norm": 0.06185663118958473, "kl": 0.012700080871582031, "learning_rate": 1e-06, "loss": 0.0249, "step": 175 }, { "clip_ratio/high_max": 0.006790737199480645, "clip_ratio/high_mean": 0.0015923003993520979, "clip_ratio/low_mean": 0.001576155286784342, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031684556133768638, "epoch": 0.03055920302987553, "grad_norm": 0.06139228120446205, "kl": 0.013003349304199219, "learning_rate": 1e-06, "loss": 0.0247, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3928571428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 1975.2210693359375, "completions/mean_terminated_length": 1265.5404052734375, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.03073283486527255, "grad_norm": 0.06676605343818665, "kl": 0.009716987609863281, "learning_rate": 1e-06, "loss": 0.0418, "num_tokens": 22562152.0, "reward": 0.3549107313156128, "reward_std": 0.177725151181221, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "step": 177 }, { "clip_ratio/high_max": 0.004548877608613111, "clip_ratio/high_mean": 0.0008783262214819842, "clip_ratio/low_mean": 0.000663214160795178, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001541540401376551, "epoch": 0.030906466700669568, "grad_norm": 0.06532199680805206, "kl": 0.00983428955078125, "learning_rate": 1e-06, "loss": 0.0418, "step": 178 }, { "clip_ratio/high_max": 0.0045628743810084416, "clip_ratio/high_mean": 0.0008866641655913554, "clip_ratio/low_mean": 0.0007016445351837319, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015883087053225609, "epoch": 0.031080098536066587, "grad_norm": 0.06407991051673889, "kl": 0.010072708129882812, "learning_rate": 1e-06, "loss": 0.0417, "step": 179 }, { "clip_ratio/high_max": 0.004998375647119246, "clip_ratio/high_mean": 0.0009823043696997047, "clip_ratio/low_mean": 0.0007823778678357485, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001764682230714243, "epoch": 0.03125373037146361, "grad_norm": 0.06273090094327927, "kl": 0.010199546813964844, "learning_rate": 1e-06, "loss": 0.0416, "step": 180 }, { "clip_ratio/high_max": 0.005659303313223063, "clip_ratio/high_mean": 0.0010998464335898461, "clip_ratio/low_mean": 0.0009710038584671565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020708502852357924, "epoch": 0.031427362206860626, "grad_norm": 0.06945890933275223, "kl": 0.010499000549316406, "learning_rate": 1e-06, "loss": 0.0415, "step": 181 }, { "clip_ratio/high_max": 0.006450060100178234, "clip_ratio/high_mean": 0.0012763663307850948, "clip_ratio/low_mean": 0.0009811855325096985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002257551879665698, "epoch": 0.03160099404225765, "grad_norm": 0.06256972998380661, "kl": 0.010334014892578125, "learning_rate": 1e-06, "loss": 0.0413, "step": 182 }, { "clip_ratio/high_max": 0.007272931428815355, "clip_ratio/high_mean": 0.0014719640876137419, "clip_ratio/low_mean": 0.0012525945339803002, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027245586024946533, "epoch": 0.031774625877654665, "grad_norm": 0.059706758707761765, "kl": 0.010437965393066406, "learning_rate": 1e-06, "loss": 0.0412, "step": 183 }, { "clip_ratio/high_max": 0.008268690766271902, "clip_ratio/high_mean": 0.0016652210124448175, "clip_ratio/low_mean": 0.0013867361976735992, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030519571992044803, "epoch": 0.03194825771305169, "grad_norm": 0.05767683684825897, "kl": 0.010613441467285156, "learning_rate": 1e-06, "loss": 0.041, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3660714285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 1952.2523193359375, "completions/mean_terminated_length": 1305.6373291015625, "completions/min_length": 422.0, "completions/min_terminated_length": 422.0, "epoch": 0.03212188954844871, "grad_norm": 0.07633760571479797, "kl": 0.011765480041503906, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 23497825.0, "reward": 0.3705357313156128, "reward_std": 0.21936172246932983, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "step": 185 }, { "clip_ratio/high_max": 0.0038792926279711537, "clip_ratio/high_mean": 0.0008722499837858777, "clip_ratio/low_mean": 0.0008423511903856706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017146011678050854, "epoch": 0.03229552138384573, "grad_norm": 0.07396596670150757, "kl": 0.011639595031738281, "learning_rate": 1e-06, "loss": 0.024, "step": 186 }, { "clip_ratio/high_max": 0.004016244240119704, "clip_ratio/high_mean": 0.0009196023208914994, "clip_ratio/low_mean": 0.00090161151592838, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018212138256785693, "epoch": 0.03246915321924275, "grad_norm": 0.07162409275770187, "kl": 0.012526512145996094, "learning_rate": 1e-06, "loss": 0.0238, "step": 187 }, { "clip_ratio/high_max": 0.004857996631471906, "clip_ratio/high_mean": 0.0011080410940849106, "clip_ratio/low_mean": 0.0010575318001428968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021655728760379134, "epoch": 0.032642785054639765, "grad_norm": 0.0715244859457016, "kl": 0.011851310729980469, "learning_rate": 1e-06, "loss": 0.0237, "step": 188 }, { "clip_ratio/high_max": 0.0050666102215473074, "clip_ratio/high_mean": 0.001124887626247073, "clip_ratio/low_mean": 0.001253108634500677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023779962612024974, "epoch": 0.03281641689003679, "grad_norm": 0.06715864688158035, "kl": 0.012170791625976562, "learning_rate": 1e-06, "loss": 0.0235, "step": 189 }, { "clip_ratio/high_max": 0.005782000062026782, "clip_ratio/high_mean": 0.0013126272124281968, "clip_ratio/low_mean": 0.0014697350989081315, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002782362300422392, "epoch": 0.03299004872543381, "grad_norm": 0.0644487664103508, "kl": 0.012758255004882812, "learning_rate": 1e-06, "loss": 0.0233, "step": 190 }, { "clip_ratio/high_max": 0.006263343962928047, "clip_ratio/high_mean": 0.0013922708549216622, "clip_ratio/low_mean": 0.0016894532082005753, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030817239876341773, "epoch": 0.03316368056083083, "grad_norm": 0.06378155946731567, "kl": 0.012617111206054688, "learning_rate": 1e-06, "loss": 0.0232, "step": 191 }, { "clip_ratio/high_max": 0.0076059747516410425, "clip_ratio/high_mean": 0.0016639072482576012, "clip_ratio/low_mean": 0.001954144422597892, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003618051647208631, "epoch": 0.03333731239622785, "grad_norm": 0.06046665459871292, "kl": 0.012700080871582031, "learning_rate": 1e-06, "loss": 0.0229, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 1812.82373046875, "completions/mean_terminated_length": 1240.4708251953125, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "epoch": 0.033510944231624866, "grad_norm": 0.0884440466761589, "kl": 0.012012481689453125, "learning_rate": 1e-06, "loss": 0.0221, "num_tokens": 24366090.0, "reward": 0.4575892984867096, "reward_std": 0.22057272493839264, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "step": 193 }, { "clip_ratio/high_max": 0.004949147507431917, "clip_ratio/high_mean": 0.0012433102515387873, "clip_ratio/low_mean": 0.0006688782805213123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019121885698041297, "epoch": 0.03368457606702189, "grad_norm": 0.08634856343269348, "kl": 0.012041091918945312, "learning_rate": 1e-06, "loss": 0.0221, "step": 194 }, { "clip_ratio/high_max": 0.005268261706078192, "clip_ratio/high_mean": 0.0013053044835942273, "clip_ratio/low_mean": 0.0008330149846642598, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021383194625741453, "epoch": 0.03385820790241891, "grad_norm": 0.08262626081705093, "kl": 0.012241363525390625, "learning_rate": 1e-06, "loss": 0.0219, "step": 195 }, { "clip_ratio/high_max": 0.005148787411599187, "clip_ratio/high_mean": 0.0012898793356725946, "clip_ratio/low_mean": 0.0009201424438742833, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022100218211562606, "epoch": 0.03403183973781593, "grad_norm": 0.08106177300214767, "kl": 0.012590408325195312, "learning_rate": 1e-06, "loss": 0.0218, "step": 196 }, { "clip_ratio/high_max": 0.006804738986829761, "clip_ratio/high_mean": 0.0016117114610096905, "clip_ratio/low_mean": 0.0012787132136509172, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028904247174068587, "epoch": 0.03420547157321295, "grad_norm": 0.07583323866128922, "kl": 0.013156890869140625, "learning_rate": 1e-06, "loss": 0.0216, "step": 197 }, { "clip_ratio/high_max": 0.007689140489674173, "clip_ratio/high_mean": 0.0017711525524646277, "clip_ratio/low_mean": 0.0015852261722102412, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003356378691023565, "epoch": 0.034379103408609966, "grad_norm": 0.0727124810218811, "kl": 0.013492584228515625, "learning_rate": 1e-06, "loss": 0.0213, "step": 198 }, { "clip_ratio/high_max": 0.008880560413672356, "clip_ratio/high_mean": 0.00200296037382941, "clip_ratio/low_mean": 0.0017906150546878052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037935754226054996, "epoch": 0.03455273524400699, "grad_norm": 0.0711628720164299, "kl": 0.013805389404296875, "learning_rate": 1e-06, "loss": 0.0211, "step": 199 }, { "clip_ratio/high_max": 0.009983885793189984, "clip_ratio/high_mean": 0.0023307662586375955, "clip_ratio/low_mean": 0.002078321793305804, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004409088100146619, "epoch": 0.03472636707940401, "grad_norm": 0.06648662686347961, "kl": 0.013883590698242188, "learning_rate": 1e-06, "loss": 0.0208, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2410714285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 1663.0067138671875, "completions/mean_terminated_length": 1215.444091796875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.03489999891480103, "grad_norm": 0.10324262827634811, "kl": 0.013607025146484375, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 25166805.0, "reward": 0.4263392984867096, "reward_std": 0.2893931567668915, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "step": 201 }, { "clip_ratio/high_max": 0.006110871418059105, "clip_ratio/high_mean": 0.0017305461306023062, "clip_ratio/low_mean": 0.0007836382820869403, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002514184436222422, "epoch": 0.03507363075019805, "grad_norm": 0.09852239489555359, "kl": 0.013637542724609375, "learning_rate": 1e-06, "loss": 0.0296, "step": 202 }, { "clip_ratio/high_max": 0.0067143171618226916, "clip_ratio/high_mean": 0.002016777214976173, "clip_ratio/low_mean": 0.0008447439882957042, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028615211977012223, "epoch": 0.03524726258559507, "grad_norm": 0.10066750645637512, "kl": 0.013576507568359375, "learning_rate": 1e-06, "loss": 0.0294, "step": 203 }, { "clip_ratio/high_max": 0.007235827069962397, "clip_ratio/high_mean": 0.0020889421275569475, "clip_ratio/low_mean": 0.0011157552812619542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003204697342880536, "epoch": 0.03542089442099209, "grad_norm": 0.09108741581439972, "kl": 0.014263153076171875, "learning_rate": 1e-06, "loss": 0.0291, "step": 204 }, { "clip_ratio/high_max": 0.007691378614254063, "clip_ratio/high_mean": 0.002357282998673327, "clip_ratio/low_mean": 0.0014654204981070507, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003822703529294813, "epoch": 0.035594526256389106, "grad_norm": 0.0882318988442421, "kl": 0.014585494995117188, "learning_rate": 1e-06, "loss": 0.0288, "step": 205 }, { "clip_ratio/high_max": 0.008854923748003785, "clip_ratio/high_mean": 0.0025403192403246067, "clip_ratio/low_mean": 0.0018793898625517613, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004419709137437167, "epoch": 0.03576815809178613, "grad_norm": 0.08394334465265274, "kl": 0.015314102172851562, "learning_rate": 1e-06, "loss": 0.0285, "step": 206 }, { "clip_ratio/high_max": 0.011119187132862862, "clip_ratio/high_mean": 0.0030645387514596223, "clip_ratio/low_mean": 0.002380040307798481, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005444578953756718, "epoch": 0.03594178992718315, "grad_norm": 0.07953047007322311, "kl": 0.015575408935546875, "learning_rate": 1e-06, "loss": 0.0282, "step": 207 }, { "clip_ratio/high_max": 0.011757322630728595, "clip_ratio/high_mean": 0.0032355821476812707, "clip_ratio/low_mean": 0.0030131003568385495, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006248682562727481, "epoch": 0.03611542176258017, "grad_norm": 0.08291537314653397, "kl": 0.016530990600585938, "learning_rate": 1e-06, "loss": 0.0279, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3008.0, "completions/mean_length": 1676.462158203125, "completions/mean_terminated_length": 1118.2469482421875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.03628905359797719, "grad_norm": 0.09211947023868561, "kl": 0.016153335571289062, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 25974444.0, "reward": 0.3839285969734192, "reward_std": 0.2109421193599701, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "step": 209 }, { "clip_ratio/high_max": 0.004868702037128969, "clip_ratio/high_mean": 0.0011972069837611343, "clip_ratio/low_mean": 0.0007156190602017887, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019128260428260546, "epoch": 0.036462685433374206, "grad_norm": 0.0881376788020134, "kl": 0.016271591186523438, "learning_rate": 1e-06, "loss": 0.0279, "step": 210 }, { "clip_ratio/high_max": 0.005703013686797931, "clip_ratio/high_mean": 0.0013917315932303609, "clip_ratio/low_mean": 0.0008305928781737748, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022223244995984714, "epoch": 0.03663631726877123, "grad_norm": 0.0876634493470192, "kl": 0.016721725463867188, "learning_rate": 1e-06, "loss": 0.0277, "step": 211 }, { "clip_ratio/high_max": 0.006239787122467533, "clip_ratio/high_mean": 0.0015253543101607647, "clip_ratio/low_mean": 0.001110815766878659, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026361700911365915, "epoch": 0.03680994910416825, "grad_norm": 0.08540176600217819, "kl": 0.017267227172851562, "learning_rate": 1e-06, "loss": 0.0275, "step": 212 }, { "clip_ratio/high_max": 0.007624491225215024, "clip_ratio/high_mean": 0.001821018858208845, "clip_ratio/low_mean": 0.0013840796436852543, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003205098459147848, "epoch": 0.03698358093956527, "grad_norm": 0.08115784823894501, "kl": 0.017566680908203125, "learning_rate": 1e-06, "loss": 0.0272, "step": 213 }, { "clip_ratio/high_max": 0.00879309340234613, "clip_ratio/high_mean": 0.0021584609999081295, "clip_ratio/low_mean": 0.0016739412376409746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003832402198895579, "epoch": 0.03715721277496229, "grad_norm": 0.08007089048624039, "kl": 0.017839431762695312, "learning_rate": 1e-06, "loss": 0.0269, "step": 214 }, { "clip_ratio/high_max": 0.009846461622146307, "clip_ratio/high_mean": 0.002476550294431945, "clip_ratio/low_mean": 0.0021181534575589467, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004594703819748247, "epoch": 0.03733084461035931, "grad_norm": 0.07569095492362976, "kl": 0.018009185791015625, "learning_rate": 1e-06, "loss": 0.0266, "step": 215 }, { "clip_ratio/high_max": 0.011361671186023159, "clip_ratio/high_mean": 0.002887388767703669, "clip_ratio/low_mean": 0.002451889884468983, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005339278777682921, "epoch": 0.03750447644575633, "grad_norm": 0.0741347074508667, "kl": 0.017885208129882812, "learning_rate": 1e-06, "loss": 0.0263, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3325892857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 1782.0469970703125, "completions/mean_terminated_length": 1139.2274169921875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.03767810828115335, "grad_norm": 0.09740474075078964, "kl": 0.017147064208984375, "learning_rate": 1e-06, "loss": 0.0304, "num_tokens": 26837577.0, "reward": 0.3906250298023224, "reward_std": 0.2525072693824768, "rewards/accuracy_reward/mean": 0.390625, "rewards/accuracy_reward/std": 0.48843589425086975, "step": 217 }, { "clip_ratio/high_max": 0.005332508637366118, "clip_ratio/high_mean": 0.001370633568512858, "clip_ratio/low_mean": 0.0007678367946937215, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021384703595686005, "epoch": 0.03785174011655037, "grad_norm": 0.09584915637969971, "kl": 0.017284393310546875, "learning_rate": 1e-06, "loss": 0.0304, "step": 218 }, { "clip_ratio/high_max": 0.005739041098422604, "clip_ratio/high_mean": 0.0014045209854884888, "clip_ratio/low_mean": 0.0010460083308316825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024505292894900776, "epoch": 0.03802537195194739, "grad_norm": 0.09607510268688202, "kl": 0.01747894287109375, "learning_rate": 1e-06, "loss": 0.0302, "step": 219 }, { "clip_ratio/high_max": 0.006547189979755785, "clip_ratio/high_mean": 0.0016914033903958625, "clip_ratio/low_mean": 0.0012982333742002083, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029896367523178924, "epoch": 0.03819900378734441, "grad_norm": 0.08867420256137848, "kl": 0.018507003784179688, "learning_rate": 1e-06, "loss": 0.0299, "step": 220 }, { "clip_ratio/high_max": 0.007556199350801762, "clip_ratio/high_mean": 0.0019446096266619861, "clip_ratio/low_mean": 0.001646923848966253, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035915335138270166, "epoch": 0.03837263562274143, "grad_norm": 0.0830821543931961, "kl": 0.018945693969726562, "learning_rate": 1e-06, "loss": 0.0297, "step": 221 }, { "clip_ratio/high_max": 0.008688813970366027, "clip_ratio/high_mean": 0.00224279583835596, "clip_ratio/low_mean": 0.002087621947794105, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0043304177779646125, "epoch": 0.03854626745813845, "grad_norm": 0.08074706047773361, "kl": 0.019781112670898438, "learning_rate": 1e-06, "loss": 0.0293, "step": 222 }, { "clip_ratio/high_max": 0.010158728233363945, "clip_ratio/high_mean": 0.002590658068584162, "clip_ratio/low_mean": 0.00267202976920089, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005262687882350292, "epoch": 0.03871989929353547, "grad_norm": 0.07953561097383499, "kl": 0.020990371704101562, "learning_rate": 1e-06, "loss": 0.029, "step": 223 }, { "clip_ratio/high_max": 0.011524382287461776, "clip_ratio/high_mean": 0.0029348107000259915, "clip_ratio/low_mean": 0.0031014906799100572, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006036301398125943, "epoch": 0.03889353112893249, "grad_norm": 0.0781690776348114, "kl": 0.021318435668945312, "learning_rate": 1e-06, "loss": 0.0288, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3325892857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 1905.1763916015625, "completions/mean_terminated_length": 1323.7156982421875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.03906716296432951, "grad_norm": 0.08692702651023865, "kl": 0.018560409545898438, "learning_rate": 1e-06, "loss": 0.0485, "num_tokens": 27765488.0, "reward": 0.3995535969734192, "reward_std": 0.257693886756897, "rewards/accuracy_reward/mean": 0.3995535671710968, "rewards/accuracy_reward/std": 0.49035418033599854, "step": 225 }, { "clip_ratio/high_max": 0.004552488593617454, "clip_ratio/high_mean": 0.0013200651642364392, "clip_ratio/low_mean": 0.0009387147081270086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002258779857584159, "epoch": 0.03924079479972653, "grad_norm": 0.08543648570775986, "kl": 0.018747329711914062, "learning_rate": 1e-06, "loss": 0.0485, "step": 226 }, { "clip_ratio/high_max": 0.0049271274037892, "clip_ratio/high_mean": 0.0014010166028128879, "clip_ratio/low_mean": 0.0011060155679842865, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025070321644307114, "epoch": 0.039414426635123546, "grad_norm": 0.08079492300748825, "kl": 0.019062042236328125, "learning_rate": 1e-06, "loss": 0.0483, "step": 227 }, { "clip_ratio/high_max": 0.0050275506800971925, "clip_ratio/high_mean": 0.0014530286371154943, "clip_ratio/low_mean": 0.0012954226458532503, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027484512429509778, "epoch": 0.03958805847052057, "grad_norm": 0.07789900153875351, "kl": 0.020151138305664062, "learning_rate": 1e-06, "loss": 0.0482, "step": 228 }, { "clip_ratio/high_max": 0.005840708276082296, "clip_ratio/high_mean": 0.0016514880203430948, "clip_ratio/low_mean": 0.0015613746854796773, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032128626626217738, "epoch": 0.03976169030591759, "grad_norm": 0.07526417821645737, "kl": 0.020353317260742188, "learning_rate": 1e-06, "loss": 0.0479, "step": 229 }, { "clip_ratio/high_max": 0.006925522851815913, "clip_ratio/high_mean": 0.001868501101853326, "clip_ratio/low_mean": 0.0020055468175996793, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038740479612897616, "epoch": 0.03993532214131461, "grad_norm": 0.07520931959152222, "kl": 0.021577835083007812, "learning_rate": 1e-06, "loss": 0.0476, "step": 230 }, { "clip_ratio/high_max": 0.007875671260990202, "clip_ratio/high_mean": 0.0021329824239728623, "clip_ratio/low_mean": 0.0023019557283987524, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004434938215126749, "epoch": 0.04010895397671163, "grad_norm": 0.0737130269408226, "kl": 0.022966384887695312, "learning_rate": 1e-06, "loss": 0.0473, "step": 231 }, { "clip_ratio/high_max": 0.008405009568377864, "clip_ratio/high_mean": 0.0022551096917595714, "clip_ratio/low_mean": 0.0027959976082456706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005051107225881424, "epoch": 0.04028258581210865, "grad_norm": 0.07218996435403824, "kl": 0.022974014282226562, "learning_rate": 1e-06, "loss": 0.0471, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3660714285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 1915.1719970703125, "completions/mean_terminated_length": 1247.144287109375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.04045621764750567, "grad_norm": 0.07778734713792801, "kl": 0.020908355712890625, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 28689765.0, "reward": 0.3571428656578064, "reward_std": 0.23717302083969116, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969305515289307, "step": 233 }, { "clip_ratio/high_max": 0.004096274522453314, "clip_ratio/high_mean": 0.0010440445271342469, "clip_ratio/low_mean": 0.0009120921677094884, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019561367080314085, "epoch": 0.04062984948290269, "grad_norm": 0.07743727415800095, "kl": 0.02142333984375, "learning_rate": 1e-06, "loss": 0.0438, "step": 234 }, { "clip_ratio/high_max": 0.003972830767452251, "clip_ratio/high_mean": 0.0010130089558515465, "clip_ratio/low_mean": 0.001060686111486575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002073695048238733, "epoch": 0.04080348131829971, "grad_norm": 0.0738179162144661, "kl": 0.02239227294921875, "learning_rate": 1e-06, "loss": 0.0437, "step": 235 }, { "clip_ratio/high_max": 0.004793579104443779, "clip_ratio/high_mean": 0.0012439648503459466, "clip_ratio/low_mean": 0.001272642019102932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025166068371618167, "epoch": 0.04097711315369673, "grad_norm": 0.07125945389270782, "kl": 0.022857666015625, "learning_rate": 1e-06, "loss": 0.0435, "step": 236 }, { "clip_ratio/high_max": 0.005087339162855642, "clip_ratio/high_mean": 0.001317936995292257, "clip_ratio/low_mean": 0.0014809940539635136, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002798931087454548, "epoch": 0.04115074498909375, "grad_norm": 0.06937792897224426, "kl": 0.022848129272460938, "learning_rate": 1e-06, "loss": 0.0433, "step": 237 }, { "clip_ratio/high_max": 0.005826350341521902, "clip_ratio/high_mean": 0.001493196527917462, "clip_ratio/low_mean": 0.001807853019272443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003301049568108283, "epoch": 0.04132437682449077, "grad_norm": 0.06516200304031372, "kl": 0.023662567138671875, "learning_rate": 1e-06, "loss": 0.0431, "step": 238 }, { "clip_ratio/high_max": 0.00700578601390589, "clip_ratio/high_mean": 0.001755614562171104, "clip_ratio/low_mean": 0.0021548195672949078, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003910434141289443, "epoch": 0.04149800865988779, "grad_norm": 0.06389059126377106, "kl": 0.024005889892578125, "learning_rate": 1e-06, "loss": 0.0428, "step": 239 }, { "clip_ratio/high_max": 0.00805301249056356, "clip_ratio/high_mean": 0.0020288965679355897, "clip_ratio/low_mean": 0.0025113965184573317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004540293099125847, "epoch": 0.04167164049528481, "grad_norm": 0.06284527480602264, "kl": 0.02553558349609375, "learning_rate": 1e-06, "loss": 0.0426, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 1766.7344970703125, "completions/mean_terminated_length": 1197.7724609375, "completions/min_length": 330.0, "completions/min_terminated_length": 330.0, "epoch": 0.04184527233068183, "grad_norm": 0.08844015747308731, "kl": 0.02428436279296875, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 29541262.0, "reward": 0.424107164144516, "reward_std": 0.26864972710609436, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "step": 241 }, { "clip_ratio/high_max": 0.004497351197642274, "clip_ratio/high_mean": 0.0011910576290574681, "clip_ratio/low_mean": 0.0010750420033218688, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022660996528429678, "epoch": 0.04201890416607885, "grad_norm": 0.08550579845905304, "kl": 0.024089813232421875, "learning_rate": 1e-06, "loss": 0.031, "step": 242 }, { "clip_ratio/high_max": 0.004826405602216255, "clip_ratio/high_mean": 0.0012726356490020407, "clip_ratio/low_mean": 0.0011405973895080024, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024132329708663747, "epoch": 0.04219253600147587, "grad_norm": 0.08401601016521454, "kl": 0.024364471435546875, "learning_rate": 1e-06, "loss": 0.0308, "step": 243 }, { "clip_ratio/high_max": 0.005422168735094601, "clip_ratio/high_mean": 0.0014528332721965853, "clip_ratio/low_mean": 0.0013987763818477106, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002851609664503485, "epoch": 0.042366167836872894, "grad_norm": 0.0806071013212204, "kl": 0.025005340576171875, "learning_rate": 1e-06, "loss": 0.0306, "step": 244 }, { "clip_ratio/high_max": 0.0065971098956651986, "clip_ratio/high_mean": 0.0016868205511855194, "clip_ratio/low_mean": 0.0016656252928441972, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003352445834025275, "epoch": 0.04253979967226991, "grad_norm": 0.07850795984268188, "kl": 0.025421142578125, "learning_rate": 1e-06, "loss": 0.0303, "step": 245 }, { "clip_ratio/high_max": 0.007209781568235485, "clip_ratio/high_mean": 0.001885793374185596, "clip_ratio/low_mean": 0.0021862599510313885, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004072053350682836, "epoch": 0.04271343150766693, "grad_norm": 0.07665248215198517, "kl": 0.0263214111328125, "learning_rate": 1e-06, "loss": 0.03, "step": 246 }, { "clip_ratio/high_max": 0.00824850112257991, "clip_ratio/high_mean": 0.002092805352731375, "clip_ratio/low_mean": 0.002633838314523018, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004726643648609752, "epoch": 0.04288706334306395, "grad_norm": 0.07348505407571793, "kl": 0.026905059814453125, "learning_rate": 1e-06, "loss": 0.0297, "step": 247 }, { "clip_ratio/high_max": 0.010216107482847292, "clip_ratio/high_mean": 0.0025497382166577154, "clip_ratio/low_mean": 0.0033160306184072397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005865768951480277, "epoch": 0.04306069517846097, "grad_norm": 0.06938531249761581, "kl": 0.02728271484375, "learning_rate": 1e-06, "loss": 0.0294, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3303571428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2897.0, "completions/mean_length": 1775.310302734375, "completions/mean_terminated_length": 1135.6099853515625, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "epoch": 0.043234327013857994, "grad_norm": 0.08025509119033813, "kl": 0.02388763427734375, "learning_rate": 1e-06, "loss": 0.0297, "num_tokens": 30400689.0, "reward": 0.4732142984867096, "reward_std": 0.20771192014217377, "rewards/accuracy_reward/mean": 0.4732142984867096, "rewards/accuracy_reward/std": 0.4998401999473572, "step": 249 }, { "clip_ratio/high_max": 0.00468600131716812, "clip_ratio/high_mean": 0.0012604062935679394, "clip_ratio/low_mean": 0.0006269318098475196, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018873380849981913, "epoch": 0.04340795884925501, "grad_norm": 0.07850334048271179, "kl": 0.0234222412109375, "learning_rate": 1e-06, "loss": 0.0297, "step": 250 }, { "clip_ratio/high_max": 0.005379546430049231, "clip_ratio/high_mean": 0.0014179299341776641, "clip_ratio/low_mean": 0.0007029731621059909, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002120903120157891, "epoch": 0.04358159068465203, "grad_norm": 0.07666642963886261, "kl": 0.0237274169921875, "learning_rate": 1e-06, "loss": 0.0295, "step": 251 }, { "clip_ratio/high_max": 0.005008225853089243, "clip_ratio/high_mean": 0.0013815777342642832, "clip_ratio/low_mean": 0.0009180950150948775, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002299672714798362, "epoch": 0.04375522252004905, "grad_norm": 0.07542448490858078, "kl": 0.024616241455078125, "learning_rate": 1e-06, "loss": 0.0293, "step": 252 }, { "clip_ratio/high_max": 0.005977287702989997, "clip_ratio/high_mean": 0.0016110274941638636, "clip_ratio/low_mean": 0.0012067765023857646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002817804002916091, "epoch": 0.04392885435544607, "grad_norm": 0.07272520661354065, "kl": 0.025409698486328125, "learning_rate": 1e-06, "loss": 0.0291, "step": 253 }, { "clip_ratio/high_max": 0.007121860926417867, "clip_ratio/high_mean": 0.0018848643571800494, "clip_ratio/low_mean": 0.0014941154163352621, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033789797889767215, "epoch": 0.04410248619084309, "grad_norm": 0.07068776339292526, "kl": 0.026065826416015625, "learning_rate": 1e-06, "loss": 0.0288, "step": 254 }, { "clip_ratio/high_max": 0.007464214657375123, "clip_ratio/high_mean": 0.0020125920918872, "clip_ratio/low_mean": 0.0019426869903327315, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003955279049478122, "epoch": 0.04427611802624011, "grad_norm": 0.06645433604717255, "kl": 0.02706146240234375, "learning_rate": 1e-06, "loss": 0.0286, "step": 255 }, { "clip_ratio/high_max": 0.009022290858410997, "clip_ratio/high_mean": 0.002390394306530652, "clip_ratio/low_mean": 0.002287794780841068, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004678189050537185, "epoch": 0.04444974986163713, "grad_norm": 0.06222284957766533, "kl": 0.027568817138671875, "learning_rate": 1e-06, "loss": 0.0283, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3549107142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1853.63623046875, "completions/mean_terminated_length": 1183.3253173828125, "completions/min_length": 252.0, "completions/min_terminated_length": 252.0, "epoch": 0.04462338169703415, "grad_norm": 0.0897333025932312, "kl": 0.032238006591796875, "learning_rate": 1e-06, "loss": 0.0352, "num_tokens": 31290454.0, "reward": 0.3660714328289032, "reward_std": 0.2087731808423996, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.4822677969932556, "step": 257 }, { "clip_ratio/high_max": 0.005109682108013658, "clip_ratio/high_mean": 0.0011701928524416871, "clip_ratio/low_mean": 0.0008590378647568286, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020292306726332754, "epoch": 0.04479701353243117, "grad_norm": 0.10249108076095581, "kl": 0.029628753662109375, "learning_rate": 1e-06, "loss": 0.0352, "step": 258 }, { "clip_ratio/high_max": 0.0055341773622785695, "clip_ratio/high_mean": 0.001277970951377938, "clip_ratio/low_mean": 0.000980439177737935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002258410097056185, "epoch": 0.04497064536782819, "grad_norm": 0.08279615640640259, "kl": 0.03108978271484375, "learning_rate": 1e-06, "loss": 0.035, "step": 259 }, { "clip_ratio/high_max": 0.006623380402743351, "clip_ratio/high_mean": 0.0014605744008804322, "clip_ratio/low_mean": 0.0011489141375022882, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00260948854702292, "epoch": 0.04514427720322521, "grad_norm": 0.13437478244304657, "kl": 0.055805206298828125, "learning_rate": 1e-06, "loss": 0.0349, "step": 260 }, { "clip_ratio/high_max": 0.007549438269052189, "clip_ratio/high_mean": 0.0016944714634519187, "clip_ratio/low_mean": 0.0013699054447897652, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030643768805020954, "epoch": 0.045317909038622234, "grad_norm": 0.07547251135110855, "kl": 0.03168487548828125, "learning_rate": 1e-06, "loss": 0.0346, "step": 261 }, { "clip_ratio/high_max": 0.009956727582903113, "clip_ratio/high_mean": 0.0021436338929561316, "clip_ratio/low_mean": 0.0015240156485560874, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036676495437859558, "epoch": 0.04549154087401925, "grad_norm": 0.17749373614788055, "kl": 0.030246734619140625, "learning_rate": 1e-06, "loss": 0.0344, "step": 262 }, { "clip_ratio/high_max": 0.010581660928437486, "clip_ratio/high_mean": 0.00234993631420366, "clip_ratio/low_mean": 0.001856701342148881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004206637666356983, "epoch": 0.04566517270941627, "grad_norm": 0.24201083183288574, "kl": 0.09323883056640625, "learning_rate": 1e-06, "loss": 0.0342, "step": 263 }, { "clip_ratio/high_max": 0.011565886423340999, "clip_ratio/high_mean": 0.002520075974643987, "clip_ratio/low_mean": 0.002318985671081464, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004839061606617179, "epoch": 0.04583880454481329, "grad_norm": 0.06495387852191925, "kl": 0.033054351806640625, "learning_rate": 1e-06, "loss": 0.0338, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3191964285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 1717.5379638671875, "completions/mean_terminated_length": 1082.4951171875, "completions/min_length": 204.0, "completions/min_terminated_length": 204.0, "epoch": 0.04601243638021031, "grad_norm": 0.08803840726613998, "kl": 0.034870147705078125, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 32117903.0, "reward": 0.361607164144516, "reward_std": 0.21335135400295258, "rewards/accuracy_reward/mean": 0.3616071343421936, "rewards/accuracy_reward/std": 0.4810029864311218, "step": 265 }, { "clip_ratio/high_max": 0.004549150060483953, "clip_ratio/high_mean": 0.0011557191055544536, "clip_ratio/low_mean": 0.0007803374112427264, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019360565038368804, "epoch": 0.046186068215607334, "grad_norm": 0.08755552023649216, "kl": 0.033267974853515625, "learning_rate": 1e-06, "loss": 0.0284, "step": 266 }, { "clip_ratio/high_max": 0.0055232373415492475, "clip_ratio/high_mean": 0.0013100980436320242, "clip_ratio/low_mean": 0.0008009549296730256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021110529887664597, "epoch": 0.04635970005100435, "grad_norm": 0.08286398649215698, "kl": 0.0343170166015625, "learning_rate": 1e-06, "loss": 0.0282, "step": 267 }, { "clip_ratio/high_max": 0.0063953503195079975, "clip_ratio/high_mean": 0.0015015570656942145, "clip_ratio/low_mean": 0.0010375124461461382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025390695336682256, "epoch": 0.04653333188640137, "grad_norm": 0.08441655337810516, "kl": 0.03447723388671875, "learning_rate": 1e-06, "loss": 0.028, "step": 268 }, { "clip_ratio/high_max": 0.007173350160883274, "clip_ratio/high_mean": 0.0016034443215175997, "clip_ratio/low_mean": 0.0013146558831067523, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029181002209952567, "epoch": 0.04670696372179839, "grad_norm": 0.07247421890497208, "kl": 0.035465240478515625, "learning_rate": 1e-06, "loss": 0.0278, "step": 269 }, { "clip_ratio/high_max": 0.00848618638701737, "clip_ratio/high_mean": 0.001912261577672325, "clip_ratio/low_mean": 0.0016346873680959106, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035469489703245927, "epoch": 0.04688059555719541, "grad_norm": 0.07031188160181046, "kl": 0.037353515625, "learning_rate": 1e-06, "loss": 0.0275, "step": 270 }, { "clip_ratio/high_max": 0.00961923376962659, "clip_ratio/high_mean": 0.0021382813297350367, "clip_ratio/low_mean": 0.0020962743774362025, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004234555701259524, "epoch": 0.047054227392592435, "grad_norm": 0.0727355107665062, "kl": 0.037532806396484375, "learning_rate": 1e-06, "loss": 0.0273, "step": 271 }, { "clip_ratio/high_max": 0.01176786868745694, "clip_ratio/high_mean": 0.0025794994635361945, "clip_ratio/low_mean": 0.002377243786213512, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004956743356160587, "epoch": 0.04722785922798945, "grad_norm": 0.0658554881811142, "kl": 0.038555145263671875, "learning_rate": 1e-06, "loss": 0.027, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2209821428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 1584.044677734375, "completions/mean_terminated_length": 1161.9599609375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.047401491063386474, "grad_norm": 0.1648370623588562, "kl": 0.06563949584960938, "learning_rate": 1e-06, "loss": 0.0272, "num_tokens": 32885531.0, "reward": 0.4441964626312256, "reward_std": 0.28459516167640686, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "step": 273 }, { "clip_ratio/high_max": 0.005250699829048244, "clip_ratio/high_mean": 0.0015376813571492676, "clip_ratio/low_mean": 0.000983728538244577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002521409931432572, "epoch": 0.04757512289878349, "grad_norm": 12.81436824798584, "kl": 0.031848907470703125, "learning_rate": 1e-06, "loss": 0.0421, "step": 274 }, { "clip_ratio/high_max": 0.00508655186422402, "clip_ratio/high_mean": 0.001495751246693544, "clip_ratio/low_mean": 0.0011207826920553998, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026165339304498048, "epoch": 0.04774875473418051, "grad_norm": 1957.364501953125, "kl": 828.0305061340332, "learning_rate": 1e-06, "loss": 0.8562, "step": 275 }, { "clip_ratio/high_max": 0.00661788590150536, "clip_ratio/high_mean": 0.0019578086685214657, "clip_ratio/low_mean": 0.001031016005526908, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00298882473816775, "epoch": 0.047922386569577535, "grad_norm": 23.13109016418457, "kl": 15.217571258544922, "learning_rate": 1e-06, "loss": 0.0423, "step": 276 }, { "clip_ratio/high_max": 0.0072947012158692814, "clip_ratio/high_mean": 0.002008416807257163, "clip_ratio/low_mean": 0.0010965286720647782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031049454364620033, "epoch": 0.04809601840497455, "grad_norm": 0.11151262372732162, "kl": 0.056583404541015625, "learning_rate": 1e-06, "loss": 0.0271, "step": 277 }, { "clip_ratio/high_max": 0.007911000357125886, "clip_ratio/high_mean": 0.0022981930515015847, "clip_ratio/low_mean": 0.0012795379880117252, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035777310104094795, "epoch": 0.048269650240371574, "grad_norm": 3.7950191497802734, "kl": 0.0310211181640625, "learning_rate": 1e-06, "loss": 0.0289, "step": 278 }, { "clip_ratio/high_max": 0.008378271773835877, "clip_ratio/high_mean": 0.002272631729283603, "clip_ratio/low_mean": 0.0014259196100283589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036985513179388363, "epoch": 0.04844328207576859, "grad_norm": 0.09474775940179825, "kl": 0.042537689208984375, "learning_rate": 1e-06, "loss": 0.0268, "step": 279 }, { "clip_ratio/high_max": 0.008074118250078754, "clip_ratio/high_mean": 0.0022810307091276627, "clip_ratio/low_mean": 0.0016864106978573545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003967441383792902, "epoch": 0.04861691391116561, "grad_norm": 0.09845507889986038, "kl": 0.09454727172851562, "learning_rate": 1e-06, "loss": 0.0267, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 1487.0670166015625, "completions/mean_terminated_length": 893.9324951171875, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.04879054574656263, "grad_norm": 0.13528770208358765, "kl": 0.08474349975585938, "learning_rate": 1e-06, "loss": 0.0159, "num_tokens": 33606529.0, "reward": 0.4933035969734192, "reward_std": 0.20237557590007782, "rewards/accuracy_reward/mean": 0.4933035671710968, "rewards/accuracy_reward/std": 0.5005140900611877, "step": 281 }, { "clip_ratio/high_max": 0.005518481626495486, "clip_ratio/high_mean": 0.0013079054560876102, "clip_ratio/low_mean": 0.0006415843376998964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019494897769618547, "epoch": 0.04896417758195965, "grad_norm": 0.10275360196828842, "kl": 0.057941436767578125, "learning_rate": 1e-06, "loss": 0.0158, "step": 282 }, { "clip_ratio/high_max": 0.005318089952197624, "clip_ratio/high_mean": 0.0012609470236384368, "clip_ratio/low_mean": 0.0008111000020107895, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020720469819934806, "epoch": 0.049137809417356675, "grad_norm": 0.0975520983338356, "kl": 0.0487060546875, "learning_rate": 1e-06, "loss": 0.0156, "step": 283 }, { "clip_ratio/high_max": 0.006777534003049368, "clip_ratio/high_mean": 0.0015024058329800027, "clip_ratio/low_mean": 0.0010438823537697317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002546288178564282, "epoch": 0.04931144125275369, "grad_norm": 0.09427982568740845, "kl": 0.0457305908203125, "learning_rate": 1e-06, "loss": 0.0154, "step": 284 }, { "clip_ratio/high_max": 0.007698584504396422, "clip_ratio/high_mean": 0.0017372293777953018, "clip_ratio/low_mean": 0.0012986168221686967, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030358461881405674, "epoch": 0.04948507308815071, "grad_norm": 0.08655794709920883, "kl": 0.0445709228515625, "learning_rate": 1e-06, "loss": 0.015, "step": 285 }, { "clip_ratio/high_max": 0.00907187326811254, "clip_ratio/high_mean": 0.0020208473688398954, "clip_ratio/low_mean": 0.0017221434977727768, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003742990826140158, "epoch": 0.04965870492354773, "grad_norm": 0.0863230749964714, "kl": 0.042949676513671875, "learning_rate": 1e-06, "loss": 0.0148, "step": 286 }, { "clip_ratio/high_max": 0.01111614878027467, "clip_ratio/high_mean": 0.0024249475600299775, "clip_ratio/low_mean": 0.0020374952555357595, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004462442830117652, "epoch": 0.04983233675894475, "grad_norm": 0.0817715972661972, "kl": 0.04030609130859375, "learning_rate": 1e-06, "loss": 0.0145, "step": 287 }, { "clip_ratio/high_max": 0.012894197454443201, "clip_ratio/high_mean": 0.002798843489472347, "clip_ratio/low_mean": 0.0024854559501363838, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005284299561026273, "epoch": 0.050005968594341775, "grad_norm": 0.07589513063430786, "kl": 0.041049957275390625, "learning_rate": 1e-06, "loss": 0.0142, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3638392857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 1878.9107666015625, "completions/mean_terminated_length": 1196.54736328125, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "epoch": 0.05017960042973879, "grad_norm": 0.09525071829557419, "kl": 0.035015106201171875, "learning_rate": 1e-06, "loss": 0.0464, "num_tokens": 34514329.0, "reward": 0.296875, "reward_std": 0.233181893825531, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45739173889160156, "step": 289 }, { "clip_ratio/high_max": 0.004487207468628185, "clip_ratio/high_mean": 0.001184563238894043, "clip_ratio/low_mean": 0.0008969603122750414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020815235138798016, "epoch": 0.050353232265135814, "grad_norm": 0.09354723244905472, "kl": 0.035091400146484375, "learning_rate": 1e-06, "loss": 0.0464, "step": 290 }, { "clip_ratio/high_max": 0.0049894246130861575, "clip_ratio/high_mean": 0.001282536524513489, "clip_ratio/low_mean": 0.0009899082506308332, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022724447817381588, "epoch": 0.05052686410053283, "grad_norm": 0.09170901030302048, "kl": 0.035770416259765625, "learning_rate": 1e-06, "loss": 0.0462, "step": 291 }, { "clip_ratio/high_max": 0.005716858198866248, "clip_ratio/high_mean": 0.0014339544477479649, "clip_ratio/low_mean": 0.001138001349318074, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00257195577796665, "epoch": 0.05070049593592985, "grad_norm": 0.08835425227880478, "kl": 0.036304473876953125, "learning_rate": 1e-06, "loss": 0.046, "step": 292 }, { "clip_ratio/high_max": 0.0064092636785062496, "clip_ratio/high_mean": 0.0015636555349374248, "clip_ratio/low_mean": 0.0015383405889224377, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031019961079437053, "epoch": 0.050874127771326876, "grad_norm": 0.08421893417835236, "kl": 0.037212371826171875, "learning_rate": 1e-06, "loss": 0.0456, "step": 293 }, { "clip_ratio/high_max": 0.0076824534371553455, "clip_ratio/high_mean": 0.001847036872732133, "clip_ratio/low_mean": 0.001929277364979498, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037763142408948625, "epoch": 0.05104775960672389, "grad_norm": 0.08230450749397278, "kl": 0.038494110107421875, "learning_rate": 1e-06, "loss": 0.0454, "step": 294 }, { "clip_ratio/high_max": 0.009685077016911237, "clip_ratio/high_mean": 0.002242555319753592, "clip_ratio/low_mean": 0.002174456431021099, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004417011703480966, "epoch": 0.051221391442120914, "grad_norm": 0.07934076339006424, "kl": 0.039150238037109375, "learning_rate": 1e-06, "loss": 0.0451, "step": 295 }, { "clip_ratio/high_max": 0.010979399921779986, "clip_ratio/high_mean": 0.0025703867277115933, "clip_ratio/low_mean": 0.0026802026750374353, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005250589401839534, "epoch": 0.05139502327751793, "grad_norm": 0.07625240087509155, "kl": 0.040515899658203125, "learning_rate": 1e-06, "loss": 0.0447, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 1664.8751220703125, "completions/mean_terminated_length": 1102.0250244140625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.05156865511291495, "grad_norm": 0.10203053057193756, "kl": 0.040508270263671875, "learning_rate": 1e-06, "loss": 0.0273, "num_tokens": 35317881.0, "reward": 0.4263392984867096, "reward_std": 0.2711445093154907, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509721994400024, "step": 297 }, { "clip_ratio/high_max": 0.005975721473078011, "clip_ratio/high_mean": 0.0014749864831173909, "clip_ratio/low_mean": 0.0010361300755903358, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025111165687121684, "epoch": 0.051742286948311976, "grad_norm": 0.0957789272069931, "kl": 0.040496826171875, "learning_rate": 1e-06, "loss": 0.0272, "step": 298 }, { "clip_ratio/high_max": 0.0056106932170223445, "clip_ratio/high_mean": 0.001472067138820421, "clip_ratio/low_mean": 0.0011689389339153422, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002641006045450922, "epoch": 0.05191591878370899, "grad_norm": 0.09514080733060837, "kl": 0.041107177734375, "learning_rate": 1e-06, "loss": 0.0271, "step": 299 }, { "clip_ratio/high_max": 0.006688262987154303, "clip_ratio/high_mean": 0.0016957720431491907, "clip_ratio/low_mean": 0.0015212190446618479, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003216991084627807, "epoch": 0.052089550619106015, "grad_norm": 0.0900445505976677, "kl": 0.04221343994140625, "learning_rate": 1e-06, "loss": 0.0268, "step": 300 }, { "clip_ratio/high_max": 0.007543783336586785, "clip_ratio/high_mean": 0.0019109013228444383, "clip_ratio/low_mean": 0.0019023447857762221, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038132460358610842, "epoch": 0.05226318245450303, "grad_norm": 0.08858823776245117, "kl": 0.04288482666015625, "learning_rate": 1e-06, "loss": 0.0265, "step": 301 }, { "clip_ratio/high_max": 0.008553786639822647, "clip_ratio/high_mean": 0.0021407794101833133, "clip_ratio/low_mean": 0.002194405880800332, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004335185243689921, "epoch": 0.052436814289900054, "grad_norm": 0.08457084000110626, "kl": 0.043544769287109375, "learning_rate": 1e-06, "loss": 0.0262, "step": 302 }, { "clip_ratio/high_max": 0.010216445014521014, "clip_ratio/high_mean": 0.0025627993845773744, "clip_ratio/low_mean": 0.00257830952614313, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005141108915267978, "epoch": 0.052610446125297076, "grad_norm": 0.0818793773651123, "kl": 0.0438232421875, "learning_rate": 1e-06, "loss": 0.0258, "step": 303 }, { "clip_ratio/high_max": 0.012040919515129644, "clip_ratio/high_mean": 0.0029872908626202843, "clip_ratio/low_mean": 0.003047748863536981, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006035039761627559, "epoch": 0.05278407796069409, "grad_norm": 0.07737314701080322, "kl": 0.043659210205078125, "learning_rate": 1e-06, "loss": 0.0255, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3325892857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3054.0, "completions/mean_length": 1831.638427734375, "completions/mean_terminated_length": 1213.53173828125, "completions/min_length": 206.0, "completions/min_terminated_length": 206.0, "epoch": 0.052957709796091115, "grad_norm": 0.1093229427933693, "kl": 0.04364013671875, "learning_rate": 1e-06, "loss": 0.0433, "num_tokens": 36203727.0, "reward": 0.3549107313156128, "reward_std": 0.24085968732833862, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "step": 305 }, { "clip_ratio/high_max": 0.004939934315189021, "clip_ratio/high_mean": 0.001170015851585049, "clip_ratio/low_mean": 0.0008859579572799703, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00205597384046996, "epoch": 0.05313134163148813, "grad_norm": 0.1118902787566185, "kl": 0.03981781005859375, "learning_rate": 1e-06, "loss": 0.0433, "step": 306 }, { "clip_ratio/high_max": 0.005350560459191911, "clip_ratio/high_mean": 0.001255496673820744, "clip_ratio/low_mean": 0.001092098210392578, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002347594885577564, "epoch": 0.053304973466885154, "grad_norm": 0.10086560249328613, "kl": 0.0471954345703125, "learning_rate": 1e-06, "loss": 0.043, "step": 307 }, { "clip_ratio/high_max": 0.0070519737200811505, "clip_ratio/high_mean": 0.001548952689518046, "clip_ratio/low_mean": 0.0012605744063876045, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002809527084536967, "epoch": 0.05347860530228217, "grad_norm": 0.10394077748060226, "kl": 0.050563812255859375, "learning_rate": 1e-06, "loss": 0.0427, "step": 308 }, { "clip_ratio/high_max": 0.009255024215235608, "clip_ratio/high_mean": 0.0019488707921482273, "clip_ratio/low_mean": 0.0016377991314584506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003586669923606678, "epoch": 0.05365223713767919, "grad_norm": 0.4983552396297455, "kl": 0.042728424072265625, "learning_rate": 1e-06, "loss": 0.0425, "step": 309 }, { "clip_ratio/high_max": 0.009962213560356759, "clip_ratio/high_mean": 0.0021321633275874774, "clip_ratio/low_mean": 0.00197062411598381, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004102787446754519, "epoch": 0.053825868973076216, "grad_norm": 282.8660888671875, "kl": 105.04236221313477, "learning_rate": 1e-06, "loss": 0.1472, "step": 310 }, { "clip_ratio/high_max": 0.01308077028079424, "clip_ratio/high_mean": 0.0026282821936547407, "clip_ratio/low_mean": 0.002352885258005699, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004981167468940839, "epoch": 0.05399950080847323, "grad_norm": 0.4189506769180298, "kl": 0.0444183349609375, "learning_rate": 1e-06, "loss": 0.0421, "step": 311 }, { "clip_ratio/high_max": 0.013844245382642839, "clip_ratio/high_mean": 0.002774031020635448, "clip_ratio/low_mean": 0.002558393183790031, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005332424174412154, "epoch": 0.054173132643870255, "grad_norm": 4.080616474151611, "kl": 0.04517364501953125, "learning_rate": 1e-06, "loss": 0.0427, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 1753.82373046875, "completions/mean_terminated_length": 1197.2603759765625, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.05434676447926727, "grad_norm": 0.08744122833013535, "kl": 0.042156219482421875, "learning_rate": 1e-06, "loss": 0.0223, "num_tokens": 37050312.0, "reward": 0.424107164144516, "reward_std": 0.21222595870494843, "rewards/accuracy_reward/mean": 0.4241071343421936, "rewards/accuracy_reward/std": 0.4947591722011566, "step": 313 }, { "clip_ratio/high_max": 0.003542871698300587, "clip_ratio/high_mean": 0.0009596789573151909, "clip_ratio/low_mean": 0.0008116384071854554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017713173110678326, "epoch": 0.05452039631466429, "grad_norm": 0.08609718084335327, "kl": 0.042598724365234375, "learning_rate": 1e-06, "loss": 0.0224, "step": 314 }, { "clip_ratio/high_max": 0.004162313533015549, "clip_ratio/high_mean": 0.0010940209967884584, "clip_ratio/low_mean": 0.0009866756854535197, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002080696728626208, "epoch": 0.054694028150061316, "grad_norm": 0.08149557560682297, "kl": 0.043598175048828125, "learning_rate": 1e-06, "loss": 0.0222, "step": 315 }, { "clip_ratio/high_max": 0.004462535813217983, "clip_ratio/high_mean": 0.0011132742038171273, "clip_ratio/low_mean": 0.0010431421965222398, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002156416405341588, "epoch": 0.05486765998545833, "grad_norm": 0.08036191761493683, "kl": 0.04474639892578125, "learning_rate": 1e-06, "loss": 0.022, "step": 316 }, { "clip_ratio/high_max": 0.005238765395915834, "clip_ratio/high_mean": 0.0012905609614790592, "clip_ratio/low_mean": 0.0012569526738843706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002547513646277366, "epoch": 0.055041291820855355, "grad_norm": 0.0770607590675354, "kl": 0.04560089111328125, "learning_rate": 1e-06, "loss": 0.0218, "step": 317 }, { "clip_ratio/high_max": 0.005747852377680829, "clip_ratio/high_mean": 0.001438932538349036, "clip_ratio/low_mean": 0.001529786425180646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002968718947158777, "epoch": 0.05521492365625237, "grad_norm": 0.07358980923891068, "kl": 0.047374725341796875, "learning_rate": 1e-06, "loss": 0.0216, "step": 318 }, { "clip_ratio/high_max": 0.007073239514284069, "clip_ratio/high_mean": 0.0016832210153552296, "clip_ratio/low_mean": 0.0018547766949268407, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00353799770346086, "epoch": 0.055388555491649394, "grad_norm": 0.08348195999860764, "kl": 0.057861328125, "learning_rate": 1e-06, "loss": 0.0213, "step": 319 }, { "clip_ratio/high_max": 0.008232709515141323, "clip_ratio/high_mean": 0.0019164883869962068, "clip_ratio/low_mean": 0.0023353125634457683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00425180090496724, "epoch": 0.05556218732704642, "grad_norm": 0.07066477090120316, "kl": 0.050518035888671875, "learning_rate": 1e-06, "loss": 0.0211, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3037.0, "completions/mean_length": 1584.497802734375, "completions/mean_terminated_length": 1076.7874755859375, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.05573581916244343, "grad_norm": 0.10790194571018219, "kl": 0.07151031494140625, "learning_rate": 1e-06, "loss": 0.0168, "num_tokens": 37820207.0, "reward": 0.3973214328289032, "reward_std": 0.21703942120075226, "rewards/accuracy_reward/mean": 0.3973214328289032, "rewards/accuracy_reward/std": 0.48989057540893555, "step": 321 }, { "clip_ratio/high_max": 0.005183204040804412, "clip_ratio/high_mean": 0.001196277377403021, "clip_ratio/low_mean": 0.0008955024732131278, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020917798319715075, "epoch": 0.055909450997840456, "grad_norm": 0.10144967585802078, "kl": 0.06662750244140625, "learning_rate": 1e-06, "loss": 0.0168, "step": 322 }, { "clip_ratio/high_max": 0.005564189996221103, "clip_ratio/high_mean": 0.001242121808900265, "clip_ratio/low_mean": 0.0010646717091731261, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023067934980645077, "epoch": 0.05608308283323747, "grad_norm": 0.09740137308835983, "kl": 0.064361572265625, "learning_rate": 1e-06, "loss": 0.0166, "step": 323 }, { "clip_ratio/high_max": 0.006562176367879147, "clip_ratio/high_mean": 0.0014994810085227073, "clip_ratio/low_mean": 0.0012264480201338301, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002725929030930274, "epoch": 0.056256714668634494, "grad_norm": 0.09444169700145721, "kl": 0.062103271484375, "learning_rate": 1e-06, "loss": 0.0163, "step": 324 }, { "clip_ratio/high_max": 0.008055319245613646, "clip_ratio/high_mean": 0.0018195765560449217, "clip_ratio/low_mean": 0.0014550009054801194, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032745774387876736, "epoch": 0.05643034650403152, "grad_norm": 0.09070952236652374, "kl": 0.0597381591796875, "learning_rate": 1e-06, "loss": 0.016, "step": 325 }, { "clip_ratio/high_max": 0.009569266367179807, "clip_ratio/high_mean": 0.002077107304103265, "clip_ratio/low_mean": 0.0017930963122125831, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003870203629048774, "epoch": 0.05660397833942853, "grad_norm": 0.08731154352426529, "kl": 0.060394287109375, "learning_rate": 1e-06, "loss": 0.0157, "step": 326 }, { "clip_ratio/high_max": 0.010980385282891802, "clip_ratio/high_mean": 0.0024072208061625133, "clip_ratio/low_mean": 0.0022245814479902037, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004631802205040003, "epoch": 0.056777610174825556, "grad_norm": 0.08380023390054703, "kl": 0.0583648681640625, "learning_rate": 1e-06, "loss": 0.0154, "step": 327 }, { "clip_ratio/high_max": 0.012716443383396836, "clip_ratio/high_mean": 0.002804416733852122, "clip_ratio/low_mean": 0.00255477804421389, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005359194899938302, "epoch": 0.05695124201022257, "grad_norm": 0.0786973163485527, "kl": 0.0587158203125, "learning_rate": 1e-06, "loss": 0.0151, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 1768.1251220703125, "completions/mean_terminated_length": 1199.769287109375, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.057124873845619595, "grad_norm": 0.09932158887386322, "kl": 0.061847686767578125, "learning_rate": 1e-06, "loss": 0.0365, "num_tokens": 38676263.0, "reward": 0.3638392984867096, "reward_std": 0.24513620138168335, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "step": 329 }, { "clip_ratio/high_max": 0.005129917379235849, "clip_ratio/high_mean": 0.00130367848169044, "clip_ratio/low_mean": 0.0009259659404960985, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002229644427643507, "epoch": 0.05729850568101662, "grad_norm": 0.09222784638404846, "kl": 0.06255340576171875, "learning_rate": 1e-06, "loss": 0.0365, "step": 330 }, { "clip_ratio/high_max": 0.005561808105994714, "clip_ratio/high_mean": 0.0014707320333400276, "clip_ratio/low_mean": 0.001004757358941788, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024754893929639366, "epoch": 0.057472137516413634, "grad_norm": 0.091619111597538, "kl": 0.061183929443359375, "learning_rate": 1e-06, "loss": 0.0363, "step": 331 }, { "clip_ratio/high_max": 0.0063105123554123566, "clip_ratio/high_mean": 0.0016406525055572274, "clip_ratio/low_mean": 0.0012502775484790618, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002890930056310026, "epoch": 0.057645769351810656, "grad_norm": 0.08693597465753555, "kl": 0.058380126953125, "learning_rate": 1e-06, "loss": 0.0361, "step": 332 }, { "clip_ratio/high_max": 0.007050618758512428, "clip_ratio/high_mean": 0.0018210350508525153, "clip_ratio/low_mean": 0.0016098665428216918, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034309015718463343, "epoch": 0.05781940118720767, "grad_norm": 0.08769677579402924, "kl": 0.05838775634765625, "learning_rate": 1e-06, "loss": 0.0358, "step": 333 }, { "clip_ratio/high_max": 0.007842807102861116, "clip_ratio/high_mean": 0.0020333841716819734, "clip_ratio/low_mean": 0.0020482436630118173, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004081627765117446, "epoch": 0.057993033022604695, "grad_norm": 0.08194868266582489, "kl": 0.0645294189453125, "learning_rate": 1e-06, "loss": 0.0356, "step": 334 }, { "clip_ratio/high_max": 0.009304997067374643, "clip_ratio/high_mean": 0.002345988797969767, "clip_ratio/low_mean": 0.002435989966215857, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0047819787250773516, "epoch": 0.05816666485800171, "grad_norm": 0.07955046743154526, "kl": 0.06250762939453125, "learning_rate": 1e-06, "loss": 0.0353, "step": 335 }, { "clip_ratio/high_max": 0.011086840779171325, "clip_ratio/high_mean": 0.0028061282137059607, "clip_ratio/low_mean": 0.0025723850367285195, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005378513211326208, "epoch": 0.058340296693398734, "grad_norm": 0.07680052518844604, "kl": 0.06483840942382812, "learning_rate": 1e-06, "loss": 0.0349, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3370535714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3025.0, "completions/mean_length": 1862.55810546875, "completions/mean_terminated_length": 1247.656494140625, "completions/min_length": 216.0, "completions/min_terminated_length": 216.0, "epoch": 0.05851392852879576, "grad_norm": 0.11824160069227219, "kl": 0.051666259765625, "learning_rate": 1e-06, "loss": 0.0093, "num_tokens": 39575993.0, "reward": 0.2946428656578064, "reward_std": 0.2437918335199356, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.45639169216156006, "step": 337 }, { "clip_ratio/high_max": 0.005806953340652399, "clip_ratio/high_mean": 0.0014801576180616394, "clip_ratio/low_mean": 0.0009797865106975223, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002459944125803304, "epoch": 0.05868756036419277, "grad_norm": 0.10983967036008835, "kl": 0.05157470703125, "learning_rate": 1e-06, "loss": 0.0093, "step": 338 }, { "clip_ratio/high_max": 0.005983757961075753, "clip_ratio/high_mean": 0.0015946907242323505, "clip_ratio/low_mean": 0.001077943986729224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00267263474779611, "epoch": 0.058861192199589796, "grad_norm": 0.1059994027018547, "kl": 0.050994873046875, "learning_rate": 1e-06, "loss": 0.0091, "step": 339 }, { "clip_ratio/high_max": 0.007282030906935688, "clip_ratio/high_mean": 0.001851966442700359, "clip_ratio/low_mean": 0.0012768252645400935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031287917190638836, "epoch": 0.05903482403498681, "grad_norm": 0.10019855201244354, "kl": 0.05113983154296875, "learning_rate": 1e-06, "loss": 0.0088, "step": 340 }, { "clip_ratio/high_max": 0.009089829785807524, "clip_ratio/high_mean": 0.0021552167363552144, "clip_ratio/low_mean": 0.0015984280644261162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003753644888092822, "epoch": 0.059208455870383835, "grad_norm": 0.09835273027420044, "kl": 0.05435943603515625, "learning_rate": 1e-06, "loss": 0.0085, "step": 341 }, { "clip_ratio/high_max": 0.01043886811385164, "clip_ratio/high_mean": 0.002440083650071756, "clip_ratio/low_mean": 0.0021223132862360217, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004562396964502113, "epoch": 0.05938208770578086, "grad_norm": 0.0889163464307785, "kl": 0.0561676025390625, "learning_rate": 1e-06, "loss": 0.0081, "step": 342 }, { "clip_ratio/high_max": 0.01197832818434108, "clip_ratio/high_mean": 0.0028186959079903318, "clip_ratio/low_mean": 0.0026664794104362954, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005485175331159553, "epoch": 0.05955571954117787, "grad_norm": 0.08854866772890091, "kl": 0.059234619140625, "learning_rate": 1e-06, "loss": 0.0078, "step": 343 }, { "clip_ratio/high_max": 0.014088095267652534, "clip_ratio/high_mean": 0.003320563686429523, "clip_ratio/low_mean": 0.0030789723696216242, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0063995361924753524, "epoch": 0.059729351376574896, "grad_norm": 0.08377635478973389, "kl": 0.0592498779296875, "learning_rate": 1e-06, "loss": 0.0075, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2971.0, "completions/mean_length": 1549.4866943359375, "completions/mean_terminated_length": 1111.9827880859375, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.05990298321197191, "grad_norm": 0.12456328421831131, "kl": 0.05512237548828125, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 40332787.0, "reward": 0.4575892984867096, "reward_std": 0.2685869336128235, "rewards/accuracy_reward/mean": 0.4575892984867096, "rewards/accuracy_reward/std": 0.4987550377845764, "step": 345 }, { "clip_ratio/high_max": 0.006239243182790233, "clip_ratio/high_mean": 0.0016543864949198905, "clip_ratio/low_mean": 0.0008503972799189796, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002504783755284734, "epoch": 0.060076615047368935, "grad_norm": 0.11726190149784088, "kl": 0.0535888671875, "learning_rate": 1e-06, "loss": 0.0261, "step": 346 }, { "clip_ratio/high_max": 0.0065401058782299515, "clip_ratio/high_mean": 0.0017650371282798005, "clip_ratio/low_mean": 0.001207578578942048, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029726157245022478, "epoch": 0.06025024688276596, "grad_norm": 0.11028710007667542, "kl": 0.055084228515625, "learning_rate": 1e-06, "loss": 0.0258, "step": 347 }, { "clip_ratio/high_max": 0.007494226592825726, "clip_ratio/high_mean": 0.002080778314393683, "clip_ratio/low_mean": 0.0014176345230225706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003498412837871001, "epoch": 0.060423878718162974, "grad_norm": 0.10212674736976624, "kl": 0.0569000244140625, "learning_rate": 1e-06, "loss": 0.0255, "step": 348 }, { "clip_ratio/high_max": 0.008175398485036567, "clip_ratio/high_mean": 0.0023095337978702446, "clip_ratio/low_mean": 0.0018283587187397643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004137892494327389, "epoch": 0.06059751055356, "grad_norm": 0.10606644302606583, "kl": 0.05809783935546875, "learning_rate": 1e-06, "loss": 0.0252, "step": 349 }, { "clip_ratio/high_max": 0.010218370902293827, "clip_ratio/high_mean": 0.0027977923627986456, "clip_ratio/low_mean": 0.002148048919480061, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004945841366861714, "epoch": 0.06077114238895701, "grad_norm": 0.10671241581439972, "kl": 0.058868408203125, "learning_rate": 1e-06, "loss": 0.0248, "step": 350 }, { "clip_ratio/high_max": 0.011109457878774265, "clip_ratio/high_mean": 0.0030712041407241486, "clip_ratio/low_mean": 0.002741143920502509, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005812348052131711, "epoch": 0.060944774224354036, "grad_norm": 0.0897054672241211, "kl": 0.0602264404296875, "learning_rate": 1e-06, "loss": 0.0244, "step": 351 }, { "clip_ratio/high_max": 0.012300086476898286, "clip_ratio/high_mean": 0.003504161909859249, "clip_ratio/low_mean": 0.00340957934713515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006913741261087125, "epoch": 0.06111840605975106, "grad_norm": 0.08805359154939651, "kl": 0.06313323974609375, "learning_rate": 1e-06, "loss": 0.024, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3021.0, "completions/mean_length": 1755.8751220703125, "completions/mean_terminated_length": 1119.6026611328125, "completions/min_length": 280.0, "completions/min_terminated_length": 280.0, "epoch": 0.061292037895148074, "grad_norm": 0.10731685161590576, "kl": 0.06757354736328125, "learning_rate": 1e-06, "loss": 0.0348, "num_tokens": 41186867.0, "reward": 0.3236607313156128, "reward_std": 0.1697656363248825, "rewards/accuracy_reward/mean": 0.3295454680919647, "rewards/accuracy_reward/std": 0.4705831706523895, "step": 353 }, { "clip_ratio/high_max": 0.005336291289495421, "clip_ratio/high_mean": 0.0010659901713552244, "clip_ratio/low_mean": 0.0007731101743502222, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001839100398683513, "epoch": 0.0614656697305451, "grad_norm": 0.09922567754983902, "kl": 0.069580078125, "learning_rate": 1e-06, "loss": 0.0348, "step": 354 }, { "clip_ratio/high_max": 0.006009303071550676, "clip_ratio/high_mean": 0.001183046771075169, "clip_ratio/low_mean": 0.0008966304349087295, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002079677195979457, "epoch": 0.06163930156594211, "grad_norm": 0.0945262685418129, "kl": 0.066741943359375, "learning_rate": 1e-06, "loss": 0.0346, "step": 355 }, { "clip_ratio/high_max": 0.007019474067419651, "clip_ratio/high_mean": 0.0013789934214401, "clip_ratio/low_mean": 0.0009452451158722397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023242385304911295, "epoch": 0.061812933401339136, "grad_norm": 0.08904973417520523, "kl": 0.06381988525390625, "learning_rate": 1e-06, "loss": 0.0344, "step": 356 }, { "clip_ratio/high_max": 0.007962949508510064, "clip_ratio/high_mean": 0.0015670042944293527, "clip_ratio/low_mean": 0.00135196078736044, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00291896506678313, "epoch": 0.06198656523673616, "grad_norm": 0.0841624066233635, "kl": 0.066802978515625, "learning_rate": 1e-06, "loss": 0.0342, "step": 357 }, { "clip_ratio/high_max": 0.00995414912540582, "clip_ratio/high_mean": 0.001884872958271444, "clip_ratio/low_mean": 0.0017205541898874799, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00360542713679024, "epoch": 0.062160197072133175, "grad_norm": 0.07821935415267944, "kl": 0.067169189453125, "learning_rate": 1e-06, "loss": 0.0339, "step": 358 }, { "clip_ratio/high_max": 0.011284288408205612, "clip_ratio/high_mean": 0.0022033245686543523, "clip_ratio/low_mean": 0.00219525085958594, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004398575447339681, "epoch": 0.0623338289075302, "grad_norm": 0.07185030728578568, "kl": 0.06600189208984375, "learning_rate": 1e-06, "loss": 0.0337, "step": 359 }, { "clip_ratio/high_max": 0.013932729296357138, "clip_ratio/high_mean": 0.0026642824541340815, "clip_ratio/low_mean": 0.0024918507942857104, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005156133243872318, "epoch": 0.06250746074292722, "grad_norm": 0.06902019679546356, "kl": 0.06565093994140625, "learning_rate": 1e-06, "loss": 0.0334, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1919642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1401.5201416015625, "completions/mean_terminated_length": 1004.665771484375, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.06268109257832423, "grad_norm": 0.11523715406656265, "kl": 0.0829925537109375, "learning_rate": 1e-06, "loss": 0.042, "num_tokens": 41874084.0, "reward": 0.4843750298023224, "reward_std": 0.22213825583457947, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.500314474105835, "step": 361 }, { "clip_ratio/high_max": 0.0056194695898739155, "clip_ratio/high_mean": 0.0012735780810544384, "clip_ratio/low_mean": 0.0008724085982976248, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021459866566146957, "epoch": 0.06285472441372125, "grad_norm": 0.6664465069770813, "kl": 0.0663299560546875, "learning_rate": 1e-06, "loss": 0.0421, "step": 362 }, { "clip_ratio/high_max": 0.006043556735676248, "clip_ratio/high_mean": 0.0014018746867350274, "clip_ratio/low_mean": 0.0012412539001616096, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026431285914441105, "epoch": 0.06302835624911828, "grad_norm": 45.75358963012695, "kl": 33.031005859375, "learning_rate": 1e-06, "loss": 0.0748, "step": 363 }, { "clip_ratio/high_max": 0.006670735321677057, "clip_ratio/high_mean": 0.0015709793542555417, "clip_ratio/low_mean": 0.0013159657996766327, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028869451889477205, "epoch": 0.0632019880845153, "grad_norm": 0.30057692527770996, "kl": 0.22634124755859375, "learning_rate": 1e-06, "loss": 0.0418, "step": 364 }, { "clip_ratio/high_max": 0.008239327155024512, "clip_ratio/high_mean": 0.0019023175782422186, "clip_ratio/low_mean": 0.0015305559463740792, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034328734727751, "epoch": 0.06337561991991232, "grad_norm": 29.514829635620117, "kl": 0.06683349609375, "learning_rate": 1e-06, "loss": 0.0536, "step": 365 }, { "clip_ratio/high_max": 0.009259790575015359, "clip_ratio/high_mean": 0.0021295798183018633, "clip_ratio/low_mean": 0.001880440612694656, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0040100204532791395, "epoch": 0.06354925175530933, "grad_norm": 0.41185325384140015, "kl": 0.07059478759765625, "learning_rate": 1e-06, "loss": 0.0416, "step": 366 }, { "clip_ratio/high_max": 0.010057051385956584, "clip_ratio/high_mean": 0.0022558945820492227, "clip_ratio/low_mean": 0.0022236469749259413, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004479541497858008, "epoch": 0.06372288359070635, "grad_norm": 0.10632655769586563, "kl": 0.1041107177734375, "learning_rate": 1e-06, "loss": 0.0412, "step": 367 }, { "clip_ratio/high_max": 0.010375847443356179, "clip_ratio/high_mean": 0.0024142942056641914, "clip_ratio/low_mean": 0.002282255377394904, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004696549531217897, "epoch": 0.06389651542610338, "grad_norm": 0.19416755437850952, "kl": 0.25826263427734375, "learning_rate": 1e-06, "loss": 0.0411, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4642857142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 2043.587158203125, "completions/mean_terminated_length": 1152.2958984375, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "epoch": 0.0640701472615004, "grad_norm": 0.06686825305223465, "kl": 0.0762481689453125, "learning_rate": 1e-06, "loss": 0.0166, "num_tokens": 42855523.0, "reward": 0.2566964328289032, "reward_std": 0.14270874857902527, "rewards/accuracy_reward/mean": 0.2566964328289032, "rewards/accuracy_reward/std": 0.4372987747192383, "step": 369 }, { "clip_ratio/high_max": 0.0035479452872095862, "clip_ratio/high_mean": 0.0006658306990630081, "clip_ratio/low_mean": 0.0004440160651029146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011098467721240013, "epoch": 0.06424377909689742, "grad_norm": 0.06471388041973114, "kl": 0.0737457275390625, "learning_rate": 1e-06, "loss": 0.0166, "step": 370 }, { "clip_ratio/high_max": 0.003913963300874457, "clip_ratio/high_mean": 0.0007407081479868793, "clip_ratio/low_mean": 0.00044094154770846217, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011816497067229648, "epoch": 0.06441741093229443, "grad_norm": 0.0663066953420639, "kl": 0.07201385498046875, "learning_rate": 1e-06, "loss": 0.0165, "step": 371 }, { "clip_ratio/high_max": 0.0039808220135455485, "clip_ratio/high_mean": 0.0007421798409268376, "clip_ratio/low_mean": 0.0005554691433644621, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0012976489815628156, "epoch": 0.06459104276769145, "grad_norm": 0.06176936253905296, "kl": 0.0745086669921875, "learning_rate": 1e-06, "loss": 0.0164, "step": 372 }, { "clip_ratio/high_max": 0.004743984176457161, "clip_ratio/high_mean": 0.0008880635232344503, "clip_ratio/low_mean": 0.0007049805333281256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001593044052242476, "epoch": 0.06476467460308848, "grad_norm": 0.055553264915943146, "kl": 0.077972412109375, "learning_rate": 1e-06, "loss": 0.0164, "step": 373 }, { "clip_ratio/high_max": 0.005190384817979066, "clip_ratio/high_mean": 0.000992191310160706, "clip_ratio/low_mean": 0.0007951772818159952, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017873686210805317, "epoch": 0.0649383064384855, "grad_norm": 0.053692642599344254, "kl": 0.07778167724609375, "learning_rate": 1e-06, "loss": 0.0162, "step": 374 }, { "clip_ratio/high_max": 0.005839945952175185, "clip_ratio/high_mean": 0.001111233870688011, "clip_ratio/low_mean": 0.0009334815604233881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002044715419287968, "epoch": 0.06511193827388252, "grad_norm": 0.05098182335495949, "kl": 0.07451629638671875, "learning_rate": 1e-06, "loss": 0.0161, "step": 375 }, { "clip_ratio/high_max": 0.006368879301589914, "clip_ratio/high_mean": 0.001204879712076945, "clip_ratio/low_mean": 0.001112594772621378, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023174744837888284, "epoch": 0.06528557010927953, "grad_norm": 0.053060829639434814, "kl": 0.07613372802734375, "learning_rate": 1e-06, "loss": 0.016, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2522321428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 1581.46435546875, "completions/mean_terminated_length": 1078.6865234375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.06545920194467655, "grad_norm": 0.10589797794818878, "kl": 0.06070709228515625, "learning_rate": 1e-06, "loss": 0.0193, "num_tokens": 43626595.0, "reward": 0.377232164144516, "reward_std": 0.25175273418426514, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.4852356016635895, "step": 377 }, { "clip_ratio/high_max": 0.0056065171520458534, "clip_ratio/high_mean": 0.001444566271857184, "clip_ratio/low_mean": 0.0008282972466986394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002272863524922286, "epoch": 0.06563283378007358, "grad_norm": 0.09869330376386642, "kl": 0.06194305419921875, "learning_rate": 1e-06, "loss": 0.0193, "step": 378 }, { "clip_ratio/high_max": 0.006237901616259478, "clip_ratio/high_mean": 0.0015920653704597498, "clip_ratio/low_mean": 0.0009404807206010446, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002532546073780395, "epoch": 0.0658064656154706, "grad_norm": 0.0979112982749939, "kl": 0.0598602294921875, "learning_rate": 1e-06, "loss": 0.0191, "step": 379 }, { "clip_ratio/high_max": 0.006491790987638524, "clip_ratio/high_mean": 0.001700101104688656, "clip_ratio/low_mean": 0.0011462597975651079, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028463608505262528, "epoch": 0.06598009745086762, "grad_norm": 0.09125732630491257, "kl": 0.06139373779296875, "learning_rate": 1e-06, "loss": 0.0188, "step": 380 }, { "clip_ratio/high_max": 0.0067740414961008355, "clip_ratio/high_mean": 0.0018453716929798247, "clip_ratio/low_mean": 0.0014810354673500115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003326407193526393, "epoch": 0.06615372928626463, "grad_norm": 0.08919871598482132, "kl": 0.0657501220703125, "learning_rate": 1e-06, "loss": 0.0186, "step": 381 }, { "clip_ratio/high_max": 0.009388737234985456, "clip_ratio/high_mean": 0.00232442565720703, "clip_ratio/low_mean": 0.0016376238672819454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003962049515394028, "epoch": 0.06632736112166165, "grad_norm": 0.08604692667722702, "kl": 0.0671844482421875, "learning_rate": 1e-06, "loss": 0.0183, "step": 382 }, { "clip_ratio/high_max": 0.010451542606460862, "clip_ratio/high_mean": 0.0026782474542415002, "clip_ratio/low_mean": 0.0019654905572679127, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0046437380187853705, "epoch": 0.06650099295705868, "grad_norm": 0.08209864050149918, "kl": 0.06949615478515625, "learning_rate": 1e-06, "loss": 0.018, "step": 383 }, { "clip_ratio/high_max": 0.012169826237368397, "clip_ratio/high_mean": 0.003002590359756141, "clip_ratio/low_mean": 0.0025111702425419935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005513760610483587, "epoch": 0.0666746247924557, "grad_norm": 0.07388481497764587, "kl": 0.07000732421875, "learning_rate": 1e-06, "loss": 0.0177, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4017857142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 2005.49560546875, "completions/mean_terminated_length": 1289.1865234375, "completions/min_length": 291.0, "completions/min_terminated_length": 291.0, "epoch": 0.06684825662785272, "grad_norm": 0.0914219319820404, "kl": 0.05912017822265625, "learning_rate": 1e-06, "loss": 0.0419, "num_tokens": 44591433.0, "reward": 0.3683035969734192, "reward_std": 0.26309582591056824, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "step": 385 }, { "clip_ratio/high_max": 0.004525319684034912, "clip_ratio/high_mean": 0.0012098232691641897, "clip_ratio/low_mean": 0.0008808956627035514, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020907189064018894, "epoch": 0.06702188846324973, "grad_norm": 0.0895196795463562, "kl": 0.0606231689453125, "learning_rate": 1e-06, "loss": 0.0419, "step": 386 }, { "clip_ratio/high_max": 0.004964445761288516, "clip_ratio/high_mean": 0.0013150230834071408, "clip_ratio/low_mean": 0.0010096323812831542, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00232465545013838, "epoch": 0.06719552029864675, "grad_norm": 0.08634825050830841, "kl": 0.06241607666015625, "learning_rate": 1e-06, "loss": 0.0418, "step": 387 }, { "clip_ratio/high_max": 0.006175282138428884, "clip_ratio/high_mean": 0.0015397766801470425, "clip_ratio/low_mean": 0.0011990216971753398, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00273879835367552, "epoch": 0.06736915213404378, "grad_norm": 0.08219541609287262, "kl": 0.0618896484375, "learning_rate": 1e-06, "loss": 0.0416, "step": 388 }, { "clip_ratio/high_max": 0.006398503435775638, "clip_ratio/high_mean": 0.0016829056739879888, "clip_ratio/low_mean": 0.0014244952853914583, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003107400938461069, "epoch": 0.0675427839694408, "grad_norm": 0.08105918020009995, "kl": 0.0669097900390625, "learning_rate": 1e-06, "loss": 0.0414, "step": 389 }, { "clip_ratio/high_max": 0.00798079772357596, "clip_ratio/high_mean": 0.0019763343498198083, "clip_ratio/low_mean": 0.001719651831081137, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003695986193633871, "epoch": 0.06771641580483782, "grad_norm": 0.08408187329769135, "kl": 0.0674591064453125, "learning_rate": 1e-06, "loss": 0.0411, "step": 390 }, { "clip_ratio/high_max": 0.0092309706233209, "clip_ratio/high_mean": 0.0022683374209009344, "clip_ratio/low_mean": 0.0019801971830020193, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004248534580256091, "epoch": 0.06789004764023483, "grad_norm": 0.0758381187915802, "kl": 0.06653594970703125, "learning_rate": 1e-06, "loss": 0.0409, "step": 391 }, { "clip_ratio/high_max": 0.010766991777927615, "clip_ratio/high_mean": 0.0025997662232839502, "clip_ratio/low_mean": 0.0022819040495960508, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004881670298345853, "epoch": 0.06806367947563186, "grad_norm": 0.073980912566185, "kl": 0.06707763671875, "learning_rate": 1e-06, "loss": 0.0406, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 1844.83935546875, "completions/mean_terminated_length": 1251.576171875, "completions/min_length": 253.0, "completions/min_terminated_length": 253.0, "epoch": 0.06823731131102888, "grad_norm": 0.1860767900943756, "kl": 0.1170654296875, "learning_rate": 1e-06, "loss": 0.0269, "num_tokens": 45480945.0, "reward": 0.3705357313156128, "reward_std": 0.2506236732006073, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "step": 393 }, { "clip_ratio/high_max": 0.004446897877642186, "clip_ratio/high_mean": 0.0011276168784206675, "clip_ratio/low_mean": 0.0009642268082643568, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002091843700327445, "epoch": 0.0684109431464259, "grad_norm": 0.5210447311401367, "kl": 0.06432342529296875, "learning_rate": 1e-06, "loss": 0.027, "step": 394 }, { "clip_ratio/high_max": 0.004893715995422099, "clip_ratio/high_mean": 0.0012040810097460053, "clip_ratio/low_mean": 0.00103967309496511, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002243754081064253, "epoch": 0.06858457498182292, "grad_norm": 1.2423468828201294, "kl": 0.5592803955078125, "learning_rate": 1e-06, "loss": 0.0272, "step": 395 }, { "clip_ratio/high_max": 0.006172215838887496, "clip_ratio/high_mean": 0.0015254543736773485, "clip_ratio/low_mean": 0.001030057388561545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002555511700848001, "epoch": 0.06875820681721993, "grad_norm": 2.6604764461517334, "kl": 0.062103271484375, "learning_rate": 1e-06, "loss": 0.0274, "step": 396 }, { "clip_ratio/high_max": 0.006560977159097092, "clip_ratio/high_mean": 0.0016478731686220272, "clip_ratio/low_mean": 0.0011343484165990958, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002782221599773038, "epoch": 0.06893183865261696, "grad_norm": 0.11115654557943344, "kl": 0.06381988525390625, "learning_rate": 1e-06, "loss": 0.0265, "step": 397 }, { "clip_ratio/high_max": 0.007241135281219613, "clip_ratio/high_mean": 0.001726880980640999, "clip_ratio/low_mean": 0.001442212983420177, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003169093908581999, "epoch": 0.06910547048801398, "grad_norm": 0.11702805012464523, "kl": 0.11576080322265625, "learning_rate": 1e-06, "loss": 0.0263, "step": 398 }, { "clip_ratio/high_max": 0.008242613443144364, "clip_ratio/high_mean": 0.0018720187035796698, "clip_ratio/low_mean": 0.0017030672652253998, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035750859715335537, "epoch": 0.069279102323411, "grad_norm": 0.13670207560062408, "kl": 0.1522369384765625, "learning_rate": 1e-06, "loss": 0.0261, "step": 399 }, { "clip_ratio/high_max": 0.008861112179147312, "clip_ratio/high_mean": 0.0020766100869877846, "clip_ratio/low_mean": 0.001988591576264298, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004065201654157136, "epoch": 0.06945273415880802, "grad_norm": 0.12378508597612381, "kl": 0.1538848876953125, "learning_rate": 1e-06, "loss": 0.0258, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2142857142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 1436.555908203125, "completions/mean_terminated_length": 990.5255737304688, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.06962636599420503, "grad_norm": 0.11716204881668091, "kl": 0.0779571533203125, "learning_rate": 1e-06, "loss": 0.032, "num_tokens": 46190034.0, "reward": 0.4508928656578064, "reward_std": 0.2621201276779175, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "step": 401 }, { "clip_ratio/high_max": 0.005251561226032209, "clip_ratio/high_mean": 0.0014291140587374684, "clip_ratio/low_mean": 0.0011293792681499326, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025584933209756855, "epoch": 0.06979999782960206, "grad_norm": 0.1121266707777977, "kl": 0.07659912109375, "learning_rate": 1e-06, "loss": 0.032, "step": 402 }, { "clip_ratio/high_max": 0.00638071569846943, "clip_ratio/high_mean": 0.0016256692124443362, "clip_ratio/low_mean": 0.0012098188158233825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002835488010532572, "epoch": 0.06997362966499908, "grad_norm": 0.10640721023082733, "kl": 0.081573486328125, "learning_rate": 1e-06, "loss": 0.0318, "step": 403 }, { "clip_ratio/high_max": 0.006190811058331747, "clip_ratio/high_mean": 0.001618806746591872, "clip_ratio/low_mean": 0.001648120659410779, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003266927365984884, "epoch": 0.0701472615003961, "grad_norm": 0.10140865296125412, "kl": 0.0814971923828125, "learning_rate": 1e-06, "loss": 0.0315, "step": 404 }, { "clip_ratio/high_max": 0.00787127068178961, "clip_ratio/high_mean": 0.0020247805214239634, "clip_ratio/low_mean": 0.0019185749197276891, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003943355453884578, "epoch": 0.07032089333579311, "grad_norm": 0.09799885749816895, "kl": 0.08504486083984375, "learning_rate": 1e-06, "loss": 0.0311, "step": 405 }, { "clip_ratio/high_max": 0.008386557405174244, "clip_ratio/high_mean": 0.002178767380428326, "clip_ratio/low_mean": 0.0023982791226444533, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0045770465203531785, "epoch": 0.07049452517119013, "grad_norm": 0.09074940532445908, "kl": 0.08611297607421875, "learning_rate": 1e-06, "loss": 0.0309, "step": 406 }, { "clip_ratio/high_max": 0.011373999273928348, "clip_ratio/high_mean": 0.0028028456445099437, "clip_ratio/low_mean": 0.0027801381447716267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005582983763815719, "epoch": 0.07066815700658716, "grad_norm": 0.08528374135494232, "kl": 0.08708953857421875, "learning_rate": 1e-06, "loss": 0.0305, "step": 407 }, { "clip_ratio/high_max": 0.012452576243958902, "clip_ratio/high_mean": 0.003037203515305009, "clip_ratio/low_mean": 0.0033681667846394703, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006405370211723493, "epoch": 0.07084178884198418, "grad_norm": 0.08933061361312866, "kl": 0.086517333984375, "learning_rate": 1e-06, "loss": 0.0301, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 1741.7657470703125, "completions/mean_terminated_length": 1092.1162109375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.0710154206773812, "grad_norm": 0.11477696895599365, "kl": 0.10565948486328125, "learning_rate": 1e-06, "loss": 0.0264, "num_tokens": 47028961.0, "reward": 0.2901785969734192, "reward_std": 0.19238899648189545, "rewards/accuracy_reward/mean": 0.2901785671710968, "rewards/accuracy_reward/std": 0.4543520212173462, "step": 409 }, { "clip_ratio/high_max": 0.004271955938747851, "clip_ratio/high_mean": 0.0009175661712106375, "clip_ratio/low_mean": 0.0009827749222495186, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019003410834557144, "epoch": 0.07118905251277821, "grad_norm": 0.09694434702396393, "kl": 0.09929656982421875, "learning_rate": 1e-06, "loss": 0.0264, "step": 410 }, { "clip_ratio/high_max": 0.004276532174117165, "clip_ratio/high_mean": 0.0009531798555144633, "clip_ratio/low_mean": 0.0010952282040079808, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002048408080554509, "epoch": 0.07136268434817523, "grad_norm": 0.09413027763366699, "kl": 0.0980682373046875, "learning_rate": 1e-06, "loss": 0.0263, "step": 411 }, { "clip_ratio/high_max": 0.005869147196790436, "clip_ratio/high_mean": 0.0012473878798573423, "clip_ratio/low_mean": 0.0010801615374020912, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023275494363588223, "epoch": 0.07153631618357226, "grad_norm": 0.09186019003391266, "kl": 0.09481048583984375, "learning_rate": 1e-06, "loss": 0.026, "step": 412 }, { "clip_ratio/high_max": 0.0057525066840753425, "clip_ratio/high_mean": 0.00122930605084548, "clip_ratio/low_mean": 0.0015668084481603728, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002796114451939502, "epoch": 0.07170994801896928, "grad_norm": 0.08405662328004837, "kl": 0.0983123779296875, "learning_rate": 1e-06, "loss": 0.0258, "step": 413 }, { "clip_ratio/high_max": 0.007291765967238462, "clip_ratio/high_mean": 0.0015002406180428807, "clip_ratio/low_mean": 0.0019371726907593256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003437413268329692, "epoch": 0.0718835798543663, "grad_norm": 0.08065783977508545, "kl": 0.1011962890625, "learning_rate": 1e-06, "loss": 0.0256, "step": 414 }, { "clip_ratio/high_max": 0.009932240900525358, "clip_ratio/high_mean": 0.0019328219163980975, "clip_ratio/low_mean": 0.002320596571735223, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004253418488588068, "epoch": 0.07205721168976331, "grad_norm": 0.0790739506483078, "kl": 0.1037445068359375, "learning_rate": 1e-06, "loss": 0.0253, "step": 415 }, { "clip_ratio/high_max": 0.011493629321194021, "clip_ratio/high_mean": 0.0022945110522414325, "clip_ratio/low_mean": 0.002650745253504283, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004945256366227113, "epoch": 0.07223084352516033, "grad_norm": 0.07199160754680634, "kl": 0.10162353515625, "learning_rate": 1e-06, "loss": 0.025, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 1584.5848388671875, "completions/mean_terminated_length": 1064.885498046875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.07240447536055736, "grad_norm": 0.11905412375926971, "kl": 0.08866119384765625, "learning_rate": 1e-06, "loss": 0.0375, "num_tokens": 47811095.0, "reward": 0.392857164144516, "reward_std": 0.2636214792728424, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "step": 417 }, { "clip_ratio/high_max": 0.004611975258740131, "clip_ratio/high_mean": 0.0012729928953376657, "clip_ratio/low_mean": 0.001154468882305082, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024274618153867777, "epoch": 0.07257810719595438, "grad_norm": 0.11176765710115433, "kl": 0.087310791015625, "learning_rate": 1e-06, "loss": 0.0376, "step": 418 }, { "clip_ratio/high_max": 0.005396906763053266, "clip_ratio/high_mean": 0.0014751640201211558, "clip_ratio/low_mean": 0.0012941639852215303, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027693280353560112, "epoch": 0.0727517390313514, "grad_norm": 0.11074753105640411, "kl": 0.0876312255859375, "learning_rate": 1e-06, "loss": 0.0373, "step": 419 }, { "clip_ratio/high_max": 0.005739731954236049, "clip_ratio/high_mean": 0.0015824776533008844, "clip_ratio/low_mean": 0.001494200732850004, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030766783784201834, "epoch": 0.07292537086674841, "grad_norm": 0.09918351471424103, "kl": 0.08624267578125, "learning_rate": 1e-06, "loss": 0.0371, "step": 420 }, { "clip_ratio/high_max": 0.007352237676968798, "clip_ratio/high_mean": 0.0019524257004377432, "clip_ratio/low_mean": 0.002012862743868027, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003965288426115876, "epoch": 0.07309900270214544, "grad_norm": 0.09778185188770294, "kl": 0.0861968994140625, "learning_rate": 1e-06, "loss": 0.0367, "step": 421 }, { "clip_ratio/high_max": 0.007595412316732109, "clip_ratio/high_mean": 0.002090423967274546, "clip_ratio/low_mean": 0.0024920714040490566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004582495312206447, "epoch": 0.07327263453754246, "grad_norm": 0.09883225709199905, "kl": 0.09046173095703125, "learning_rate": 1e-06, "loss": 0.0365, "step": 422 }, { "clip_ratio/high_max": 0.008931566939281765, "clip_ratio/high_mean": 0.0024086697721941164, "clip_ratio/low_mean": 0.0031704077664471697, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055790776168578304, "epoch": 0.07344626637293948, "grad_norm": 0.09345990419387817, "kl": 0.0904998779296875, "learning_rate": 1e-06, "loss": 0.0361, "step": 423 }, { "clip_ratio/high_max": 0.01068789974306128, "clip_ratio/high_mean": 0.002820230355609965, "clip_ratio/low_mean": 0.003601876233005896, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006422106642276049, "epoch": 0.0736198982083365, "grad_norm": 0.08698969334363937, "kl": 0.0921173095703125, "learning_rate": 1e-06, "loss": 0.0357, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3013392857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 1698.71435546875, "completions/mean_terminated_length": 1106.402587890625, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.07379353004373351, "grad_norm": 0.2125641405582428, "kl": 0.15181732177734375, "learning_rate": 1e-06, "loss": 0.0208, "num_tokens": 48636719.0, "reward": 0.3727678656578064, "reward_std": 0.20786669850349426, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "step": 425 }, { "clip_ratio/high_max": 0.006638437516812701, "clip_ratio/high_mean": 0.0016623922001599567, "clip_ratio/low_mean": 0.0008128262752506998, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024752185036049923, "epoch": 0.07396716187913054, "grad_norm": 9.738133430480957, "kl": 0.08911895751953125, "learning_rate": 1e-06, "loss": 0.0235, "step": 426 }, { "clip_ratio/high_max": 0.006371053787006531, "clip_ratio/high_mean": 0.0016195726029764046, "clip_ratio/low_mean": 0.0008936036165323458, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025131761613010895, "epoch": 0.07414079371452756, "grad_norm": 2287.505859375, "kl": 1218.634147644043, "learning_rate": 1e-06, "loss": 1.2423, "step": 427 }, { "clip_ratio/high_max": 0.008349580202775542, "clip_ratio/high_mean": 0.0020915290542689036, "clip_ratio/low_mean": 0.0009262196022064018, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030177486696629785, "epoch": 0.07431442554992458, "grad_norm": 207.8161163330078, "kl": 100.82711029052734, "learning_rate": 1e-06, "loss": 0.1209, "step": 428 }, { "clip_ratio/high_max": 0.011657201626803726, "clip_ratio/high_mean": 0.00282005640292482, "clip_ratio/low_mean": 0.0010234229644083825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003843479422357632, "epoch": 0.0744880573853216, "grad_norm": 1.5711063146591187, "kl": 0.0734100341796875, "learning_rate": 1e-06, "loss": 0.0212, "step": 429 }, { "clip_ratio/high_max": 0.012321706642978825, "clip_ratio/high_mean": 0.0030305155833048047, "clip_ratio/low_mean": 0.0011596299996199377, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004190145611573826, "epoch": 0.07466168922071861, "grad_norm": 1.2784291505813599, "kl": 0.073822021484375, "learning_rate": 1e-06, "loss": 0.0212, "step": 430 }, { "clip_ratio/high_max": 0.012161142818513326, "clip_ratio/high_mean": 0.003064197640924249, "clip_ratio/low_mean": 0.0012884166926596663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004352614230811014, "epoch": 0.07483532105611564, "grad_norm": 0.8636384606361389, "kl": 0.6247711181640625, "learning_rate": 1e-06, "loss": 0.0209, "step": 431 }, { "clip_ratio/high_max": 0.012331389778410085, "clip_ratio/high_mean": 0.0030326940159284277, "clip_ratio/low_mean": 0.0014862754787827726, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004518969519267557, "epoch": 0.07500895289151266, "grad_norm": 0.5960454940795898, "kl": 0.5354232788085938, "learning_rate": 1e-06, "loss": 0.0206, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3063.0, "completions/mean_length": 1586.0469970703125, "completions/mean_terminated_length": 1078.8653564453125, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.07518258472690968, "grad_norm": 0.418211966753006, "kl": 0.27220916748046875, "learning_rate": 1e-06, "loss": 0.0301, "num_tokens": 49409220.0, "reward": 0.4620535969734192, "reward_std": 0.2193603515625, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "step": 433 }, { "clip_ratio/high_max": 0.005735861777793616, "clip_ratio/high_mean": 0.0014597439567296533, "clip_ratio/low_mean": 0.0007651987748431566, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022249427493079565, "epoch": 0.0753562165623067, "grad_norm": 0.1642620861530304, "kl": 0.13524627685546875, "learning_rate": 1e-06, "loss": 0.0299, "step": 434 }, { "clip_ratio/high_max": 0.006147520805825479, "clip_ratio/high_mean": 0.001640935943896693, "clip_ratio/low_mean": 0.0010193331872869749, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026602691705193138, "epoch": 0.07552984839770371, "grad_norm": 0.10653311759233475, "kl": 0.08786773681640625, "learning_rate": 1e-06, "loss": 0.0298, "step": 435 }, { "clip_ratio/high_max": 0.0077790701361664105, "clip_ratio/high_mean": 0.0020281250976950105, "clip_ratio/low_mean": 0.001254971357411705, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003283096448285505, "epoch": 0.07570348023310074, "grad_norm": 0.09984457492828369, "kl": 0.084808349609375, "learning_rate": 1e-06, "loss": 0.0296, "step": 436 }, { "clip_ratio/high_max": 0.009569389068929013, "clip_ratio/high_mean": 0.0023543717138636566, "clip_ratio/low_mean": 0.001395418571519258, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037497902903851354, "epoch": 0.07587711206849776, "grad_norm": 0.10475429892539978, "kl": 0.08353424072265625, "learning_rate": 1e-06, "loss": 0.0294, "step": 437 }, { "clip_ratio/high_max": 0.010518163689994253, "clip_ratio/high_mean": 0.002477291108789359, "clip_ratio/low_mean": 0.0014950158956708037, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003972306951254723, "epoch": 0.07605074390389478, "grad_norm": 0.08360475301742554, "kl": 0.08065032958984375, "learning_rate": 1e-06, "loss": 0.0291, "step": 438 }, { "clip_ratio/high_max": 0.012033404167596018, "clip_ratio/high_mean": 0.002867447743710727, "clip_ratio/low_mean": 0.0016070780266090878, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004474525641853688, "epoch": 0.0762243757392918, "grad_norm": 0.07997187972068787, "kl": 0.0775299072265625, "learning_rate": 1e-06, "loss": 0.0289, "step": 439 }, { "clip_ratio/high_max": 0.014229444739612518, "clip_ratio/high_mean": 0.0032306189434621047, "clip_ratio/low_mean": 0.0016241916118815425, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004854810580582125, "epoch": 0.07639800757468881, "grad_norm": 0.08162183314561844, "kl": 0.07611846923828125, "learning_rate": 1e-06, "loss": 0.0287, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3258928571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 1752.810302734375, "completions/mean_terminated_length": 1115.0562744140625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.07657163941008584, "grad_norm": 0.3379739224910736, "kl": 0.1741485595703125, "learning_rate": 1e-06, "loss": 0.0646, "num_tokens": 50256751.0, "reward": 0.3437500298023224, "reward_std": 0.2726394832134247, "rewards/accuracy_reward/mean": 0.34375, "rewards/accuracy_reward/std": 0.47548985481262207, "step": 441 }, { "clip_ratio/high_max": 0.006407802098692628, "clip_ratio/high_mean": 0.001699759087387065, "clip_ratio/low_mean": 0.0008698732594893954, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025696323355077766, "epoch": 0.07674527124548286, "grad_norm": 0.14732539653778076, "kl": 0.08139801025390625, "learning_rate": 1e-06, "loss": 0.0646, "step": 442 }, { "clip_ratio/high_max": 0.006559279543580487, "clip_ratio/high_mean": 0.0016686586409377924, "clip_ratio/low_mean": 0.0010541747747083718, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002722833407460712, "epoch": 0.07691890308087988, "grad_norm": 19.248817443847656, "kl": 0.08948516845703125, "learning_rate": 1e-06, "loss": 0.0678, "step": 443 }, { "clip_ratio/high_max": 0.007590041175717488, "clip_ratio/high_mean": 0.0020089546010240156, "clip_ratio/low_mean": 0.001223362269684003, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032323168670700397, "epoch": 0.0770925349162769, "grad_norm": 0.7008788585662842, "kl": 0.305389404296875, "learning_rate": 1e-06, "loss": 0.0645, "step": 444 }, { "clip_ratio/high_max": 0.008124701969791204, "clip_ratio/high_mean": 0.0021239983057057543, "clip_ratio/low_mean": 0.0015344150897362852, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003658413432276575, "epoch": 0.07726616675167391, "grad_norm": 0.5181043148040771, "kl": 0.26507568359375, "learning_rate": 1e-06, "loss": 0.0641, "step": 445 }, { "clip_ratio/high_max": 0.009901195218844805, "clip_ratio/high_mean": 0.002591530489098659, "clip_ratio/low_mean": 0.0016299107492159237, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004221441216941457, "epoch": 0.07743979858707094, "grad_norm": 0.1227397695183754, "kl": 0.08728790283203125, "learning_rate": 1e-06, "loss": 0.0637, "step": 446 }, { "clip_ratio/high_max": 0.01269936981407227, "clip_ratio/high_mean": 0.0031316368713305565, "clip_ratio/low_mean": 0.001882031245258986, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005013668167521246, "epoch": 0.07761343042246796, "grad_norm": 7.099063396453857, "kl": 0.0874786376953125, "learning_rate": 1e-06, "loss": 0.065, "step": 447 }, { "clip_ratio/high_max": 0.013003014224523213, "clip_ratio/high_mean": 0.0032608748479105998, "clip_ratio/low_mean": 0.002270696876621514, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005531571667233948, "epoch": 0.07778706225786498, "grad_norm": 0.09852556884288788, "kl": 0.095703125, "learning_rate": 1e-06, "loss": 0.0631, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1664.8616943359375, "completions/mean_terminated_length": 1138.2637939453125, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "epoch": 0.077960694093262, "grad_norm": 2.9730663299560547, "kl": 1.12530517578125, "learning_rate": 1e-06, "loss": 0.0336, "num_tokens": 51072273.0, "reward": 0.3125, "reward_std": 0.2823428809642792, "rewards/accuracy_reward/mean": 0.3125, "rewards/accuracy_reward/std": 0.46403056383132935, "step": 449 }, { "clip_ratio/high_max": 0.006336476595606655, "clip_ratio/high_mean": 0.001634679898415925, "clip_ratio/low_mean": 0.0009602391651242215, "clip_ratio/low_min": 2.5431314497836865e-05, "clip_ratio/region_mean": 0.002594919073089841, "epoch": 0.07813432592865902, "grad_norm": 47.77210235595703, "kl": 0.08438873291015625, "learning_rate": 1e-06, "loss": 0.7841, "step": 450 }, { "clip_ratio/high_max": 0.00672393964487128, "clip_ratio/high_mean": 0.001788935236618272, "clip_ratio/low_mean": 0.0009755866403793334, "clip_ratio/low_min": 2.6194467864115722e-05, "clip_ratio/region_mean": 0.0027645218942780048, "epoch": 0.07830795776405604, "grad_norm": 77.50012969970703, "kl": 0.08161163330078125, "learning_rate": 1e-06, "loss": 0.7841, "step": 451 }, { "clip_ratio/high_max": 0.007176872903073672, "clip_ratio/high_mean": 0.0019190134071322973, "clip_ratio/low_mean": 0.0011397637385925918, "clip_ratio/low_min": 5.086262899567373e-05, "clip_ratio/region_mean": 0.0030587771852879087, "epoch": 0.07848158959945306, "grad_norm": 0.1325000524520874, "kl": 0.12021636962890625, "learning_rate": 1e-06, "loss": 0.0328, "step": 452 }, { "clip_ratio/high_max": 0.007836245007638354, "clip_ratio/high_mean": 0.002024302456447913, "clip_ratio/low_mean": 0.001299340309742547, "clip_ratio/low_min": 3.560383993317373e-05, "clip_ratio/region_mean": 0.0033236428262171103, "epoch": 0.07865522143485008, "grad_norm": 0.1208128109574318, "kl": 0.1126861572265625, "learning_rate": 1e-06, "loss": 0.0326, "step": 453 }, { "clip_ratio/high_max": 0.008698377299879212, "clip_ratio/high_mean": 0.002184918552302406, "clip_ratio/low_mean": 0.0014153072852423065, "clip_ratio/low_min": 2.6194467864115722e-05, "clip_ratio/region_mean": 0.003600225849368144, "epoch": 0.07882885327024709, "grad_norm": 0.21987178921699524, "kl": 0.08583831787109375, "learning_rate": 1e-06, "loss": 0.0325, "step": 454 }, { "clip_ratio/high_max": 0.0086220077573671, "clip_ratio/high_mean": 0.0021240051983113517, "clip_ratio/low_mean": 0.0017332685629298794, "clip_ratio/low_min": 4.7091620217543095e-05, "clip_ratio/region_mean": 0.0038572737630602205, "epoch": 0.07900248510564412, "grad_norm": 0.1388324350118637, "kl": 0.1075592041015625, "learning_rate": 1e-06, "loss": 0.0321, "step": 455 }, { "clip_ratio/high_max": 0.008979543381428812, "clip_ratio/high_mean": 0.0021915077149969875, "clip_ratio/low_mean": 0.0020834725164604606, "clip_ratio/low_min": 8.138021075865254e-05, "clip_ratio/region_mean": 0.004274980221453006, "epoch": 0.07917611694104114, "grad_norm": 0.15565025806427002, "kl": 0.12303924560546875, "learning_rate": 1e-06, "loss": 0.0319, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3035714285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 1698.5045166015625, "completions/mean_terminated_length": 1099.80126953125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.07934974877643816, "grad_norm": 0.10994289815425873, "kl": 0.08245086669921875, "learning_rate": 1e-06, "loss": 0.0239, "num_tokens": 51900219.0, "reward": 0.3750000298023224, "reward_std": 0.2389710545539856, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "step": 457 }, { "clip_ratio/high_max": 0.004674215866543818, "clip_ratio/high_mean": 0.0011440376870268665, "clip_ratio/low_mean": 0.0011321632994167885, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022762009757570922, "epoch": 0.07952338061183518, "grad_norm": 0.10395507514476776, "kl": 0.08879852294921875, "learning_rate": 1e-06, "loss": 0.024, "step": 458 }, { "clip_ratio/high_max": 0.005158200565347215, "clip_ratio/high_mean": 0.0012375456003610452, "clip_ratio/low_mean": 0.0013311869088283856, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002568732516010641, "epoch": 0.0796970124472322, "grad_norm": 0.10156629979610443, "kl": 0.0923919677734375, "learning_rate": 1e-06, "loss": 0.0238, "step": 459 }, { "clip_ratio/high_max": 0.006042232813342707, "clip_ratio/high_mean": 0.0014752502970623027, "clip_ratio/low_mean": 0.0016327071098203305, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003107957436441211, "epoch": 0.07987064428262922, "grad_norm": 0.10153991729021072, "kl": 0.096405029296875, "learning_rate": 1e-06, "loss": 0.0236, "step": 460 }, { "clip_ratio/high_max": 0.007450527620676439, "clip_ratio/high_mean": 0.001754120652549318, "clip_ratio/low_mean": 0.0016041682483773911, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033582888663659105, "epoch": 0.08004427611802624, "grad_norm": 0.0974503830075264, "kl": 0.09368896484375, "learning_rate": 1e-06, "loss": 0.0234, "step": 461 }, { "clip_ratio/high_max": 0.008224559933296405, "clip_ratio/high_mean": 0.0020154296912551217, "clip_ratio/low_mean": 0.0018496911243346403, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003865120841510361, "epoch": 0.08021790795342326, "grad_norm": 0.09553153812885284, "kl": 0.0942535400390625, "learning_rate": 1e-06, "loss": 0.0231, "step": 462 }, { "clip_ratio/high_max": 0.01015484738672967, "clip_ratio/high_mean": 0.002380861359597475, "clip_ratio/low_mean": 0.002190757253629272, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004571618697809754, "epoch": 0.08039153978882028, "grad_norm": 0.09168437868356705, "kl": 0.0938262939453125, "learning_rate": 1e-06, "loss": 0.0228, "step": 463 }, { "clip_ratio/high_max": 0.010805876227095723, "clip_ratio/high_mean": 0.002578993500719662, "clip_ratio/low_mean": 0.002579962179879658, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005158955689694267, "epoch": 0.0805651716242173, "grad_norm": 0.08476903289556503, "kl": 0.09316253662109375, "learning_rate": 1e-06, "loss": 0.0225, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2678571428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 1526.6094970703125, "completions/mean_terminated_length": 961.2225341796875, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.08073880345961432, "grad_norm": 0.12020517140626907, "kl": 0.09127044677734375, "learning_rate": 1e-06, "loss": 0.0554, "num_tokens": 52645708.0, "reward": 0.4441964626312256, "reward_std": 0.23641487956047058, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316656589508, "step": 465 }, { "clip_ratio/high_max": 0.005100770249555353, "clip_ratio/high_mean": 0.0014752637930541823, "clip_ratio/low_mean": 0.0009166427689706325, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002391906566117541, "epoch": 0.08091243529501134, "grad_norm": 0.11137521266937256, "kl": 0.0794219970703125, "learning_rate": 1e-06, "loss": 0.0554, "step": 466 }, { "clip_ratio/high_max": 0.005809335551020922, "clip_ratio/high_mean": 0.0016828934080876934, "clip_ratio/low_mean": 0.0009414197388650791, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026243131251248997, "epoch": 0.08108606713040836, "grad_norm": 0.10621678829193115, "kl": 0.08104705810546875, "learning_rate": 1e-06, "loss": 0.0552, "step": 467 }, { "clip_ratio/high_max": 0.006541128401295282, "clip_ratio/high_mean": 0.0017528621310702874, "clip_ratio/low_mean": 0.0012700571915047476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030229193189370562, "epoch": 0.08125969896580539, "grad_norm": 0.09849829226732254, "kl": 0.08394622802734375, "learning_rate": 1e-06, "loss": 0.0549, "step": 468 }, { "clip_ratio/high_max": 0.0077413789949787315, "clip_ratio/high_mean": 0.0021187207312323153, "clip_ratio/low_mean": 0.0012644958687815233, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033832166336651426, "epoch": 0.0814333308012024, "grad_norm": 0.09577888250350952, "kl": 0.0832672119140625, "learning_rate": 1e-06, "loss": 0.0547, "step": 469 }, { "clip_ratio/high_max": 0.0077891499422548804, "clip_ratio/high_mean": 0.002119074771144369, "clip_ratio/low_mean": 0.00189149139168876, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0040105661555571714, "epoch": 0.08160696263659942, "grad_norm": 0.09020861983299255, "kl": 0.0886383056640625, "learning_rate": 1e-06, "loss": 0.0545, "step": 470 }, { "clip_ratio/high_max": 0.009254689553927165, "clip_ratio/high_mean": 0.00246508059262851, "clip_ratio/low_mean": 0.002369495250604814, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004834575880522607, "epoch": 0.08178059447199644, "grad_norm": 0.09162262082099915, "kl": 0.09090423583984375, "learning_rate": 1e-06, "loss": 0.0542, "step": 471 }, { "clip_ratio/high_max": 0.010556171917414758, "clip_ratio/high_mean": 0.002823393549078901, "clip_ratio/low_mean": 0.0025055591731870663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00532895268406719, "epoch": 0.08195422630739346, "grad_norm": 0.08201772719621658, "kl": 0.08930206298828125, "learning_rate": 1e-06, "loss": 0.0539, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 3072.0, "completions/max_terminated_length": 2982.0, "completions/mean_length": 1838.13623046875, "completions/mean_terminated_length": 1097.81787109375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.08212785814279049, "grad_norm": 0.11682763695716858, "kl": 0.111297607421875, "learning_rate": 1e-06, "loss": 0.0447, "num_tokens": 53533513.0, "reward": 0.3638392984867096, "reward_std": 0.23415900766849518, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "step": 473 }, { "clip_ratio/high_max": 0.0049520835709699895, "clip_ratio/high_mean": 0.0012133184586673451, "clip_ratio/low_mean": 0.0008120185839288752, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020253370530554093, "epoch": 0.0823014899781875, "grad_norm": 0.10430175811052322, "kl": 0.102783203125, "learning_rate": 1e-06, "loss": 0.0447, "step": 474 }, { "clip_ratio/high_max": 0.005439738910354208, "clip_ratio/high_mean": 0.0013660619461006718, "clip_ratio/low_mean": 0.0008285818280455715, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002194643768689275, "epoch": 0.08247512181358452, "grad_norm": 0.6252321004867554, "kl": 0.0897216796875, "learning_rate": 1e-06, "loss": 0.0447, "step": 475 }, { "clip_ratio/high_max": 0.006120621343143284, "clip_ratio/high_mean": 0.0015218980088320677, "clip_ratio/low_mean": 0.0008989926045614993, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002420890659777797, "epoch": 0.08264875364898154, "grad_norm": 415.9425048828125, "kl": 167.09544372558594, "learning_rate": 1e-06, "loss": 0.2111, "step": 476 }, { "clip_ratio/high_max": 0.007378526846878231, "clip_ratio/high_mean": 0.0017959698866434337, "clip_ratio/low_mean": 0.001056250248439028, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028522201519081136, "epoch": 0.08282238548437856, "grad_norm": 0.19899268448352814, "kl": 0.14093017578125, "learning_rate": 1e-06, "loss": 0.0443, "step": 477 }, { "clip_ratio/high_max": 0.00810518152138684, "clip_ratio/high_mean": 0.0019798160774371354, "clip_ratio/low_mean": 0.0013497115460268105, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033295276707576704, "epoch": 0.08299601731977559, "grad_norm": 4.485450744628906, "kl": 0.08962249755859375, "learning_rate": 1e-06, "loss": 0.0497, "step": 478 }, { "clip_ratio/high_max": 0.009495730242633726, "clip_ratio/high_mean": 0.002250232181722822, "clip_ratio/low_mean": 0.0014664879327028757, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037167201207921607, "epoch": 0.0831696491551726, "grad_norm": 0.1409304440021515, "kl": 0.0923614501953125, "learning_rate": 1e-06, "loss": 0.0441, "step": 479 }, { "clip_ratio/high_max": 0.009775455167982727, "clip_ratio/high_mean": 0.0023050628442433663, "clip_ratio/low_mean": 0.0016136183712660568, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0039186812173284125, "epoch": 0.08334328099056962, "grad_norm": 0.3068983256816864, "kl": 0.20275115966796875, "learning_rate": 1e-06, "loss": 0.044, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 1545.7388916015625, "completions/mean_terminated_length": 974.5613403320312, "completions/min_length": 230.0, "completions/min_terminated_length": 230.0, "epoch": 0.08351691282596664, "grad_norm": 0.10627832263708115, "kl": 0.08594512939453125, "learning_rate": 1e-06, "loss": 0.0276, "num_tokens": 54287308.0, "reward": 0.4040178656578064, "reward_std": 0.21207119524478912, "rewards/accuracy_reward/mean": 0.4040178656578064, "rewards/accuracy_reward/std": 0.49124953150749207, "step": 481 }, { "clip_ratio/high_max": 0.0043158450062037446, "clip_ratio/high_mean": 0.0010746567681962915, "clip_ratio/low_mean": 0.000863735755046946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001938392499141628, "epoch": 0.08369054466136366, "grad_norm": 0.0987161323428154, "kl": 0.08782958984375, "learning_rate": 1e-06, "loss": 0.0276, "step": 482 }, { "clip_ratio/high_max": 0.0044860876514576375, "clip_ratio/high_mean": 0.0011203482395103492, "clip_ratio/low_mean": 0.0010565754291746998, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021769236809632275, "epoch": 0.08386417649676069, "grad_norm": 0.09091802686452866, "kl": 0.0899658203125, "learning_rate": 1e-06, "loss": 0.0275, "step": 483 }, { "clip_ratio/high_max": 0.005559797737078043, "clip_ratio/high_mean": 0.0013125105474500742, "clip_ratio/low_mean": 0.0011788479987444589, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002491358554834733, "epoch": 0.0840378083321577, "grad_norm": 0.0922025740146637, "kl": 0.0972747802734375, "learning_rate": 1e-06, "loss": 0.0273, "step": 484 }, { "clip_ratio/high_max": 0.005782710748462705, "clip_ratio/high_mean": 0.0013721469531446928, "clip_ratio/low_mean": 0.0014205987226887373, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027927457049372606, "epoch": 0.08421144016755472, "grad_norm": 0.0885249525308609, "kl": 0.09940338134765625, "learning_rate": 1e-06, "loss": 0.0271, "step": 485 }, { "clip_ratio/high_max": 0.006820173006417463, "clip_ratio/high_mean": 0.0016479617006552871, "clip_ratio/low_mean": 0.001538278566840745, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031862402574915905, "epoch": 0.08438507200295174, "grad_norm": 0.08178985863924026, "kl": 0.09528350830078125, "learning_rate": 1e-06, "loss": 0.0269, "step": 486 }, { "clip_ratio/high_max": 0.007631264175870456, "clip_ratio/high_mean": 0.0017653276581768296, "clip_ratio/low_mean": 0.0019565660013540764, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037218936886347365, "epoch": 0.08455870383834876, "grad_norm": 0.07949966192245483, "kl": 0.0980377197265625, "learning_rate": 1e-06, "loss": 0.0267, "step": 487 }, { "clip_ratio/high_max": 0.009345918479084503, "clip_ratio/high_mean": 0.0021588014487861074, "clip_ratio/low_mean": 0.002299351075635059, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004458152547158534, "epoch": 0.08473233567374579, "grad_norm": 0.07672140747308731, "kl": 0.09564971923828125, "learning_rate": 1e-06, "loss": 0.0265, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2745535714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3070.0, "completions/mean_length": 1654.607177734375, "completions/mean_terminated_length": 1118.178466796875, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.0849059675091428, "grad_norm": 0.13016219437122345, "kl": 0.1219329833984375, "learning_rate": 1e-06, "loss": 0.03, "num_tokens": 55092844.0, "reward": 0.4174107313156128, "reward_std": 0.265658438205719, "rewards/accuracy_reward/mean": 0.4174107015132904, "rewards/accuracy_reward/std": 0.4936830997467041, "step": 489 }, { "clip_ratio/high_max": 0.005037302369601093, "clip_ratio/high_mean": 0.0013612103175546508, "clip_ratio/low_mean": 0.001115944725825102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024771550197328907, "epoch": 0.08507959934453982, "grad_norm": 0.11580182611942291, "kl": 0.12087249755859375, "learning_rate": 1e-06, "loss": 0.03, "step": 490 }, { "clip_ratio/high_max": 0.005896466478588991, "clip_ratio/high_mean": 0.001530261129119026, "clip_ratio/low_mean": 0.0011982151609117864, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027284762472845614, "epoch": 0.08525323117993684, "grad_norm": 0.11087345331907272, "kl": 0.12619781494140625, "learning_rate": 1e-06, "loss": 0.0297, "step": 491 }, { "clip_ratio/high_max": 0.007079326445818879, "clip_ratio/high_mean": 0.0018123106947314227, "clip_ratio/low_mean": 0.0013768856115348171, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003189196311723208, "epoch": 0.08542686301533386, "grad_norm": 0.10911300033330917, "kl": 0.12189483642578125, "learning_rate": 1e-06, "loss": 0.0295, "step": 492 }, { "clip_ratio/high_max": 0.00778413097577868, "clip_ratio/high_mean": 0.001988353234992246, "clip_ratio/low_mean": 0.0015709733252151636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035593266002251767, "epoch": 0.08560049485073089, "grad_norm": 0.0998978465795517, "kl": 0.1194305419921875, "learning_rate": 1e-06, "loss": 0.0292, "step": 493 }, { "clip_ratio/high_max": 0.010071886827063281, "clip_ratio/high_mean": 0.002414206528555951, "clip_ratio/low_mean": 0.0018107859013980487, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004224992400850169, "epoch": 0.0857741266861279, "grad_norm": 0.09557017683982849, "kl": 0.11234283447265625, "learning_rate": 1e-06, "loss": 0.0289, "step": 494 }, { "clip_ratio/high_max": 0.011388291612092871, "clip_ratio/high_mean": 0.0027134665733683505, "clip_ratio/low_mean": 0.0021317709833965637, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004845237606787123, "epoch": 0.08594775852152492, "grad_norm": 0.0904911458492279, "kl": 0.107879638671875, "learning_rate": 1e-06, "loss": 0.0286, "step": 495 }, { "clip_ratio/high_max": 0.01344329148560064, "clip_ratio/high_mean": 0.003183992059348384, "clip_ratio/low_mean": 0.002437844041196513, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005621835975034628, "epoch": 0.08612139035692194, "grad_norm": 0.09667691588401794, "kl": 0.110595703125, "learning_rate": 1e-06, "loss": 0.0283, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 1623.372802734375, "completions/mean_terminated_length": 1062.758544921875, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.08629502219231897, "grad_norm": 0.11141736060380936, "kl": 0.08403778076171875, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 55879123.0, "reward": 0.4531250298023224, "reward_std": 0.28195062279701233, "rewards/accuracy_reward/mean": 0.453125, "rewards/accuracy_reward/std": 0.49835437536239624, "step": 497 }, { "clip_ratio/high_max": 0.005493196515089949, "clip_ratio/high_mean": 0.0015729493152321083, "clip_ratio/low_mean": 0.0008635872861759708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024365365879930323, "epoch": 0.08646865402771599, "grad_norm": 0.1065514087677002, "kl": 0.08658599853515625, "learning_rate": 1e-06, "loss": 0.0278, "step": 498 }, { "clip_ratio/high_max": 0.005504368376932689, "clip_ratio/high_mean": 0.001640295524339308, "clip_ratio/low_mean": 0.0009710071265089937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002611302623336087, "epoch": 0.086642285863113, "grad_norm": 0.10003402829170227, "kl": 0.086029052734375, "learning_rate": 1e-06, "loss": 0.0276, "step": 499 }, { "clip_ratio/high_max": 0.006330421643724549, "clip_ratio/high_mean": 0.001867873543233145, "clip_ratio/low_mean": 0.0010714344416555832, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029393079803412547, "epoch": 0.08681591769851002, "grad_norm": 0.09706158936023712, "kl": 0.0877685546875, "learning_rate": 1e-06, "loss": 0.0273, "step": 500 }, { "clip_ratio/high_max": 0.007038537221887964, "clip_ratio/high_mean": 0.0019724742178368615, "clip_ratio/low_mean": 0.0013705928336094075, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033430670882808045, "epoch": 0.08698954953390704, "grad_norm": 0.09465965628623962, "kl": 0.0891265869140625, "learning_rate": 1e-06, "loss": 0.0271, "step": 501 }, { "clip_ratio/high_max": 0.008035640048547066, "clip_ratio/high_mean": 0.00220502464526362, "clip_ratio/low_mean": 0.0016511132162122522, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038561378278245684, "epoch": 0.08716318136930407, "grad_norm": 0.09119747579097748, "kl": 0.08943939208984375, "learning_rate": 1e-06, "loss": 0.0268, "step": 502 }, { "clip_ratio/high_max": 0.008945396835770225, "clip_ratio/high_mean": 0.0025401014945600764, "clip_ratio/low_mean": 0.0018918308032880304, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0044319322987576015, "epoch": 0.08733681320470109, "grad_norm": 0.08658075332641602, "kl": 0.088775634765625, "learning_rate": 1e-06, "loss": 0.0265, "step": 503 }, { "clip_ratio/high_max": 0.011121502091555158, "clip_ratio/high_mean": 0.0031705754099675687, "clip_ratio/low_mean": 0.0020323716298662475, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005202946887948201, "epoch": 0.0875104450400981, "grad_norm": 0.08272993564605713, "kl": 0.08681488037109375, "learning_rate": 1e-06, "loss": 0.0262, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2633928571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 1635.977783203125, "completions/mean_terminated_length": 1122.4908447265625, "completions/min_length": 217.0, "completions/min_terminated_length": 217.0, "epoch": 0.08768407687549512, "grad_norm": 0.6444329619407654, "kl": 0.443206787109375, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 56676177.0, "reward": 0.345982164144516, "reward_std": 0.22207409143447876, "rewards/accuracy_reward/mean": 0.3459821343421936, "rewards/accuracy_reward/std": 0.47621920704841614, "step": 505 }, { "clip_ratio/high_max": 0.005364278011256829, "clip_ratio/high_mean": 0.0012582704821397783, "clip_ratio/low_mean": 0.0008958855391938414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021541560136029148, "epoch": 0.08785770871089214, "grad_norm": 0.11086387932300568, "kl": 0.09932708740234375, "learning_rate": 1e-06, "loss": 0.032, "step": 506 }, { "clip_ratio/high_max": 0.005748780051362701, "clip_ratio/high_mean": 0.0013644831415149383, "clip_ratio/low_mean": 0.0009025221011143003, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022670052712783217, "epoch": 0.08803134054628917, "grad_norm": 0.10319437831640244, "kl": 0.09151458740234375, "learning_rate": 1e-06, "loss": 0.0318, "step": 507 }, { "clip_ratio/high_max": 0.0064736648309917655, "clip_ratio/high_mean": 0.0015213671504170634, "clip_ratio/low_mean": 0.001126096072312066, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026474632186364033, "epoch": 0.08820497238168618, "grad_norm": 0.10298606753349304, "kl": 0.09423065185546875, "learning_rate": 1e-06, "loss": 0.0316, "step": 508 }, { "clip_ratio/high_max": 0.008642890468763653, "clip_ratio/high_mean": 0.001972393028154329, "clip_ratio/low_mean": 0.0012116891980440414, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031840822812228, "epoch": 0.0883786042170832, "grad_norm": 0.10111413896083832, "kl": 0.09131622314453125, "learning_rate": 1e-06, "loss": 0.0314, "step": 509 }, { "clip_ratio/high_max": 0.010075741687614936, "clip_ratio/high_mean": 0.002266773785777332, "clip_ratio/low_mean": 0.0013442883264360717, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003611062125855824, "epoch": 0.08855223605248022, "grad_norm": 0.09789565205574036, "kl": 0.092010498046875, "learning_rate": 1e-06, "loss": 0.0311, "step": 510 }, { "clip_ratio/high_max": 0.009856549666437786, "clip_ratio/high_mean": 0.00221199130101013, "clip_ratio/low_mean": 0.0016784163499323768, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038904076463950332, "epoch": 0.08872586788787724, "grad_norm": 0.11163059622049332, "kl": 0.118133544921875, "learning_rate": 1e-06, "loss": 0.0309, "step": 511 }, { "clip_ratio/high_max": 0.011084336030762643, "clip_ratio/high_mean": 0.0024272713817481417, "clip_ratio/low_mean": 0.002102397561429825, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004529669065959752, "epoch": 0.08889949972327427, "grad_norm": 0.10466403514146805, "kl": 0.133819580078125, "learning_rate": 1e-06, "loss": 0.0306, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3571428571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3022.0, "completions/mean_length": 1841.3751220703125, "completions/mean_terminated_length": 1157.6944580078125, "completions/min_length": 221.0, "completions/min_terminated_length": 221.0, "epoch": 0.08907313155867128, "grad_norm": 0.22046247124671936, "kl": 0.28801727294921875, "learning_rate": 1e-06, "loss": 0.0553, "num_tokens": 57565065.0, "reward": 0.3571428656578064, "reward_std": 0.2725575566291809, "rewards/accuracy_reward/mean": 0.3571428656578064, "rewards/accuracy_reward/std": 0.47969308495521545, "step": 513 }, { "clip_ratio/high_max": 0.0068128533093840815, "clip_ratio/high_mean": 0.001861096803622786, "clip_ratio/low_mean": 0.0007714842763562046, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002632581061334349, "epoch": 0.0892467633940683, "grad_norm": 0.15421748161315918, "kl": 0.16094207763671875, "learning_rate": 1e-06, "loss": 0.0554, "step": 514 }, { "clip_ratio/high_max": 0.006747161198290996, "clip_ratio/high_mean": 0.0018508401699364185, "clip_ratio/low_mean": 0.0011556152676348574, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00300645545576117, "epoch": 0.08942039522946532, "grad_norm": 0.12082920223474503, "kl": 0.13782501220703125, "learning_rate": 1e-06, "loss": 0.055, "step": 515 }, { "clip_ratio/high_max": 0.007330865752010141, "clip_ratio/high_mean": 0.0020409696890055784, "clip_ratio/low_mean": 0.0012538712198875146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032948409352684394, "epoch": 0.08959402706486234, "grad_norm": 0.10754095762968063, "kl": 0.11510467529296875, "learning_rate": 1e-06, "loss": 0.0548, "step": 516 }, { "clip_ratio/high_max": 0.008140461082803085, "clip_ratio/high_mean": 0.0022064246913942043, "clip_ratio/low_mean": 0.0017053494511856115, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003911774158041226, "epoch": 0.08976765890025937, "grad_norm": 0.10957161337137222, "kl": 0.13775634765625, "learning_rate": 1e-06, "loss": 0.0546, "step": 517 }, { "clip_ratio/high_max": 0.009768428892130032, "clip_ratio/high_mean": 0.0026083944849233376, "clip_ratio/low_mean": 0.0019726237278518965, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004581018190947361, "epoch": 0.08994129073565638, "grad_norm": 0.15992212295532227, "kl": 0.1421356201171875, "learning_rate": 1e-06, "loss": 0.0543, "step": 518 }, { "clip_ratio/high_max": 0.009824651395319961, "clip_ratio/high_mean": 0.002561258346759132, "clip_ratio/low_mean": 0.0024821007409627782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005043359145929571, "epoch": 0.0901149225710534, "grad_norm": 0.11213034391403198, "kl": 0.15108489990234375, "learning_rate": 1e-06, "loss": 0.054, "step": 519 }, { "clip_ratio/high_max": 0.012448699388187379, "clip_ratio/high_mean": 0.0031975847414287273, "clip_ratio/low_mean": 0.002543515956858755, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005741100809245836, "epoch": 0.09028855440645042, "grad_norm": 0.12178327888250351, "kl": 0.17473602294921875, "learning_rate": 1e-06, "loss": 0.0537, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3950892857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3032.0, "completions/mean_length": 1902.591552734375, "completions/mean_terminated_length": 1138.80810546875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.09046218624184744, "grad_norm": 0.10683908313512802, "kl": 0.1321563720703125, "learning_rate": 1e-06, "loss": 0.0279, "num_tokens": 58485026.0, "reward": 0.3013392984867096, "reward_std": 0.18690434098243713, "rewards/accuracy_reward/mean": 0.3013392984867096, "rewards/accuracy_reward/std": 0.4593527019023895, "step": 521 }, { "clip_ratio/high_max": 0.004161441254836973, "clip_ratio/high_mean": 0.0009243535100722511, "clip_ratio/low_mean": 0.0006814114294684259, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016057649536378449, "epoch": 0.09063581807724447, "grad_norm": 0.09160590916872025, "kl": 0.11431121826171875, "learning_rate": 1e-06, "loss": 0.0279, "step": 522 }, { "clip_ratio/high_max": 0.004887223794867168, "clip_ratio/high_mean": 0.0011157053570514108, "clip_ratio/low_mean": 0.0006971050179345184, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018128103529306827, "epoch": 0.09080944991264148, "grad_norm": 0.08670849353075027, "kl": 0.0935211181640625, "learning_rate": 1e-06, "loss": 0.0278, "step": 523 }, { "clip_ratio/high_max": 0.005632492291624658, "clip_ratio/high_mean": 0.0012323926748649683, "clip_ratio/low_mean": 0.0007966856746861595, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020290783504606225, "epoch": 0.0909830817480385, "grad_norm": 0.08111963421106339, "kl": 0.092742919921875, "learning_rate": 1e-06, "loss": 0.0277, "step": 524 }, { "clip_ratio/high_max": 0.006193608343892265, "clip_ratio/high_mean": 0.0013471346364894998, "clip_ratio/low_mean": 0.000914694109269476, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00226182876394887, "epoch": 0.09115671358343552, "grad_norm": 0.08029177039861679, "kl": 0.0906524658203125, "learning_rate": 1e-06, "loss": 0.0275, "step": 525 }, { "clip_ratio/high_max": 0.006786069818190299, "clip_ratio/high_mean": 0.001501775755059498, "clip_ratio/low_mean": 0.0009733981687531923, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024751739038038068, "epoch": 0.09133034541883255, "grad_norm": 0.07531999796628952, "kl": 0.08811187744140625, "learning_rate": 1e-06, "loss": 0.0273, "step": 526 }, { "clip_ratio/high_max": 0.007594212482217699, "clip_ratio/high_mean": 0.001648624315748748, "clip_ratio/low_mean": 0.001176010097424296, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028246344008948654, "epoch": 0.09150397725422957, "grad_norm": 0.0700899064540863, "kl": 0.08880615234375, "learning_rate": 1e-06, "loss": 0.0271, "step": 527 }, { "clip_ratio/high_max": 0.008228601454902673, "clip_ratio/high_mean": 0.0018203305726274266, "clip_ratio/low_mean": 0.0013391160227911314, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031594466054229997, "epoch": 0.09167760908962658, "grad_norm": 0.06692631542682648, "kl": 0.0894622802734375, "learning_rate": 1e-06, "loss": 0.027, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2857142857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 1568.5045166015625, "completions/mean_terminated_length": 967.1062622070312, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.0918512409250236, "grad_norm": 0.19101616740226746, "kl": 0.12415313720703125, "learning_rate": 1e-06, "loss": 0.0548, "num_tokens": 59248532.0, "reward": 0.4776785969734192, "reward_std": 0.237474724650383, "rewards/accuracy_reward/mean": 0.4776785671710968, "rewards/accuracy_reward/std": 0.5000599026679993, "step": 529 }, { "clip_ratio/high_max": 0.004930730938212946, "clip_ratio/high_mean": 0.0012868361759501568, "clip_ratio/low_mean": 0.0007535923233490394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020404284982760146, "epoch": 0.09202487276042062, "grad_norm": 0.09793981164693832, "kl": 0.1183929443359375, "learning_rate": 1e-06, "loss": 0.0549, "step": 530 }, { "clip_ratio/high_max": 0.005149885659193387, "clip_ratio/high_mean": 0.0013608545036731812, "clip_ratio/low_mean": 0.0008390456659981282, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021999001555741415, "epoch": 0.09219850459581765, "grad_norm": 0.09135421365499496, "kl": 0.10805511474609375, "learning_rate": 1e-06, "loss": 0.0546, "step": 531 }, { "clip_ratio/high_max": 0.005802119649160886, "clip_ratio/high_mean": 0.0015643643823750608, "clip_ratio/low_mean": 0.0009639787645028264, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002528343125959509, "epoch": 0.09237213643121467, "grad_norm": 0.09179678559303284, "kl": 0.10521697998046875, "learning_rate": 1e-06, "loss": 0.0544, "step": 532 }, { "clip_ratio/high_max": 0.0063509903538943036, "clip_ratio/high_mean": 0.00161700139108234, "clip_ratio/low_mean": 0.0013488403219525935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029658417888640543, "epoch": 0.09254576826661168, "grad_norm": 0.09043438732624054, "kl": 0.1098785400390625, "learning_rate": 1e-06, "loss": 0.0542, "step": 533 }, { "clip_ratio/high_max": 0.008001022644748446, "clip_ratio/high_mean": 0.002002693791382626, "clip_ratio/low_mean": 0.001296939617532189, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032996333775372477, "epoch": 0.0927194001020087, "grad_norm": 0.08607005327939987, "kl": 0.1045684814453125, "learning_rate": 1e-06, "loss": 0.054, "step": 534 }, { "clip_ratio/high_max": 0.009569516729243333, "clip_ratio/high_mean": 0.002404018957349763, "clip_ratio/low_mean": 0.0014697954329676577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038738144057788304, "epoch": 0.09289303193740572, "grad_norm": 0.08027493953704834, "kl": 0.1036834716796875, "learning_rate": 1e-06, "loss": 0.0537, "step": 535 }, { "clip_ratio/high_max": 0.010337770687328884, "clip_ratio/high_mean": 0.002568217003272366, "clip_ratio/low_mean": 0.001676308389960468, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004244525296599022, "epoch": 0.09306666377280275, "grad_norm": 0.07873507589101791, "kl": 0.1016387939453125, "learning_rate": 1e-06, "loss": 0.0535, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 3072.0, "completions/max_terminated_length": 2938.0, "completions/mean_length": 1867.2991943359375, "completions/mean_terminated_length": 1278.956787109375, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.09324029560819977, "grad_norm": 43.396202087402344, "kl": 11.810546875, "learning_rate": 1e-06, "loss": 0.0589, "num_tokens": 60151642.0, "reward": 0.3370535969734192, "reward_std": 0.2460440844297409, "rewards/accuracy_reward/mean": 0.3370535671710968, "rewards/accuracy_reward/std": 0.47323182225227356, "step": 537 }, { "clip_ratio/high_max": 0.005274559662211686, "clip_ratio/high_mean": 0.0014384091750798689, "clip_ratio/low_mean": 0.0006385300782767445, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002076939215839957, "epoch": 0.09341392744359678, "grad_norm": 0.6719356179237366, "kl": 0.91815185546875, "learning_rate": 1e-06, "loss": 0.0481, "step": 538 }, { "clip_ratio/high_max": 0.005644504650263116, "clip_ratio/high_mean": 0.001522837148513645, "clip_ratio/low_mean": 0.000654898706898166, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002177735856093932, "epoch": 0.0935875592789938, "grad_norm": 0.09695355594158173, "kl": 0.135650634765625, "learning_rate": 1e-06, "loss": 0.0473, "step": 539 }, { "clip_ratio/high_max": 0.006939794671779964, "clip_ratio/high_mean": 0.0018865773399738828, "clip_ratio/low_mean": 0.0006943041830709262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025808815044001676, "epoch": 0.09376119111439082, "grad_norm": 0.11015574634075165, "kl": 0.09882354736328125, "learning_rate": 1e-06, "loss": 0.0472, "step": 540 }, { "clip_ratio/high_max": 0.007355666559305973, "clip_ratio/high_mean": 0.0019602352313086158, "clip_ratio/low_mean": 0.0008196704252441123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002779905655188486, "epoch": 0.09393482294978785, "grad_norm": 0.10815519094467163, "kl": 0.0948638916015625, "learning_rate": 1e-06, "loss": 0.047, "step": 541 }, { "clip_ratio/high_max": 0.007229794711747672, "clip_ratio/high_mean": 0.0019797619388555177, "clip_ratio/low_mean": 0.0010098221800944884, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029895841325924266, "epoch": 0.09410845478518487, "grad_norm": 0.09395444393157959, "kl": 0.09300994873046875, "learning_rate": 1e-06, "loss": 0.0468, "step": 542 }, { "clip_ratio/high_max": 0.007187854025687557, "clip_ratio/high_mean": 0.0019107858215647866, "clip_ratio/low_mean": 0.0012856059297519096, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003196391753590433, "epoch": 0.09428208662058188, "grad_norm": 0.08563285320997238, "kl": 0.0964508056640625, "learning_rate": 1e-06, "loss": 0.0465, "step": 543 }, { "clip_ratio/high_max": 0.008668107700941619, "clip_ratio/high_mean": 0.002177841657612589, "clip_ratio/low_mean": 0.0013713472349081712, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003549188950273674, "epoch": 0.0944557184559789, "grad_norm": 0.08127689361572266, "kl": 0.09604644775390625, "learning_rate": 1e-06, "loss": 0.0463, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2991071428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2863.0, "completions/mean_length": 1659.0648193359375, "completions/mean_terminated_length": 1056.0924072265625, "completions/min_length": 304.0, "completions/min_terminated_length": 304.0, "epoch": 0.09462935029137592, "grad_norm": 0.16620483994483948, "kl": 0.18393707275390625, "learning_rate": 1e-06, "loss": 0.0438, "num_tokens": 60955591.0, "reward": 0.3683035969734192, "reward_std": 0.2862471044063568, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835725784302, "step": 545 }, { "clip_ratio/high_max": 0.006136221181805013, "clip_ratio/high_mean": 0.0018175233535657753, "clip_ratio/low_mean": 0.0009976908495445969, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002815214173097047, "epoch": 0.09480298212677295, "grad_norm": 0.12482815980911255, "kl": 0.14464569091796875, "learning_rate": 1e-06, "loss": 0.0438, "step": 546 }, { "clip_ratio/high_max": 0.007766747956338804, "clip_ratio/high_mean": 0.0021954572348477086, "clip_ratio/low_mean": 0.0009616345078029553, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031570917344652116, "epoch": 0.09497661396216997, "grad_norm": 0.12460190057754517, "kl": 0.1230010986328125, "learning_rate": 1e-06, "loss": 0.0436, "step": 547 }, { "clip_ratio/high_max": 0.007651208881725324, "clip_ratio/high_mean": 0.0022477567927126074, "clip_ratio/low_mean": 0.0012156477596363402, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00346340459145722, "epoch": 0.09515024579756698, "grad_norm": 0.11490438878536224, "kl": 0.11614227294921875, "learning_rate": 1e-06, "loss": 0.0433, "step": 548 }, { "clip_ratio/high_max": 0.0082824912678916, "clip_ratio/high_mean": 0.0023931116538733477, "clip_ratio/low_mean": 0.0015401204964291537, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003933232270355802, "epoch": 0.095323877632964, "grad_norm": 0.10520153492689133, "kl": 0.11707305908203125, "learning_rate": 1e-06, "loss": 0.043, "step": 549 }, { "clip_ratio/high_max": 0.010883668903261423, "clip_ratio/high_mean": 0.0031648834092266043, "clip_ratio/low_mean": 0.0017186409704663674, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0048835243833309505, "epoch": 0.09549750946836102, "grad_norm": 0.1033320426940918, "kl": 0.1082305908203125, "learning_rate": 1e-06, "loss": 0.0426, "step": 550 }, { "clip_ratio/high_max": 0.011999562375422101, "clip_ratio/high_mean": 0.0033463082636444597, "clip_ratio/low_mean": 0.0021084103909743135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0054547186628042255, "epoch": 0.09567114130375805, "grad_norm": 0.10134732723236084, "kl": 0.1169891357421875, "learning_rate": 1e-06, "loss": 0.0423, "step": 551 }, { "clip_ratio/high_max": 0.01298761030921014, "clip_ratio/high_mean": 0.0036355321335577173, "clip_ratio/low_mean": 0.0028167745485916384, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006452306697610766, "epoch": 0.09584477313915507, "grad_norm": 0.10426264256238937, "kl": 0.120025634765625, "learning_rate": 1e-06, "loss": 0.042, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3068.0, "completions/mean_length": 1637.55810546875, "completions/mean_terminated_length": 985.5389404296875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.09601840497455208, "grad_norm": 0.11766400933265686, "kl": 0.099365234375, "learning_rate": 1e-06, "loss": 0.028, "num_tokens": 61749505.0, "reward": 0.4196428656578064, "reward_std": 0.22153069078922272, "rewards/accuracy_reward/mean": 0.4196428656578064, "rewards/accuracy_reward/std": 0.4940522015094757, "step": 553 }, { "clip_ratio/high_max": 0.006735530809237389, "clip_ratio/high_mean": 0.001452049055842508, "clip_ratio/low_mean": 0.0006130068177299108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020650558844863554, "epoch": 0.0961920368099491, "grad_norm": 0.10147379338741302, "kl": 0.091949462890625, "learning_rate": 1e-06, "loss": 0.0279, "step": 554 }, { "clip_ratio/high_max": 0.006212017426150851, "clip_ratio/high_mean": 0.0013135956514815916, "clip_ratio/low_mean": 0.0008899748452222411, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022035704787413124, "epoch": 0.09636566864534613, "grad_norm": 0.09186606854200363, "kl": 0.098388671875, "learning_rate": 1e-06, "loss": 0.0278, "step": 555 }, { "clip_ratio/high_max": 0.006817566412792075, "clip_ratio/high_mean": 0.0013955279559922928, "clip_ratio/low_mean": 0.0010679431154585473, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024634710534883197, "epoch": 0.09653930048074315, "grad_norm": 0.08866243809461594, "kl": 0.10005950927734375, "learning_rate": 1e-06, "loss": 0.0276, "step": 556 }, { "clip_ratio/high_max": 0.007978694273333531, "clip_ratio/high_mean": 0.0016755089436628623, "clip_ratio/low_mean": 0.0011933800724364119, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002868889043384115, "epoch": 0.09671293231614017, "grad_norm": 0.0886266678571701, "kl": 0.0984954833984375, "learning_rate": 1e-06, "loss": 0.0275, "step": 557 }, { "clip_ratio/high_max": 0.009135340471402742, "clip_ratio/high_mean": 0.0018987349185408675, "clip_ratio/low_mean": 0.0015833903389648185, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034821252284018556, "epoch": 0.09688656415153718, "grad_norm": 0.08419225364923477, "kl": 0.0990753173828125, "learning_rate": 1e-06, "loss": 0.0272, "step": 558 }, { "clip_ratio/high_max": 0.009597108572052093, "clip_ratio/high_mean": 0.0020121890329392045, "clip_ratio/low_mean": 0.0018586831279208127, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003870872230436362, "epoch": 0.0970601959869342, "grad_norm": 0.08146754652261734, "kl": 0.10205078125, "learning_rate": 1e-06, "loss": 0.027, "step": 559 }, { "clip_ratio/high_max": 0.01155752845807001, "clip_ratio/high_mean": 0.0023416731783072464, "clip_ratio/low_mean": 0.00213109652941057, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004472769638596219, "epoch": 0.09723382782233123, "grad_norm": 0.11823230236768723, "kl": 0.097625732421875, "learning_rate": 1e-06, "loss": 0.0268, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 3072.0, "completions/max_terminated_length": 2872.0, "completions/mean_length": 1595.15185546875, "completions/mean_terminated_length": 1102.8690185546875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.09740745965772825, "grad_norm": 0.10981889069080353, "kl": 0.0927581787109375, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 62530589.0, "reward": 0.4375000298023224, "reward_std": 0.2476951777935028, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "step": 561 }, { "clip_ratio/high_max": 0.005128076698383666, "clip_ratio/high_mean": 0.0012049796046085248, "clip_ratio/low_mean": 0.0011319088316668058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023368884048977634, "epoch": 0.09758109149312526, "grad_norm": 0.09857713431119919, "kl": 0.10174560546875, "learning_rate": 1e-06, "loss": 0.065, "step": 562 }, { "clip_ratio/high_max": 0.005855555798916612, "clip_ratio/high_mean": 0.0014390304554581235, "clip_ratio/low_mean": 0.0009178830905511859, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002356913557377993, "epoch": 0.09775472332852228, "grad_norm": 0.09842804819345474, "kl": 0.095672607421875, "learning_rate": 1e-06, "loss": 0.0648, "step": 563 }, { "clip_ratio/high_max": 0.0058435402697796235, "clip_ratio/high_mean": 0.0014386459370143712, "clip_ratio/low_mean": 0.0014010514869369217, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028396974012139253, "epoch": 0.0979283551639193, "grad_norm": 0.08719208091497421, "kl": 0.09973907470703125, "learning_rate": 1e-06, "loss": 0.0646, "step": 564 }, { "clip_ratio/high_max": 0.007282307968125679, "clip_ratio/high_mean": 0.0017613455170248926, "clip_ratio/low_mean": 0.0014158326976030366, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031771782087162137, "epoch": 0.09810198699931633, "grad_norm": 0.08644500374794006, "kl": 0.09926605224609375, "learning_rate": 1e-06, "loss": 0.0644, "step": 565 }, { "clip_ratio/high_max": 0.008534781292837579, "clip_ratio/high_mean": 0.0020554238799377345, "clip_ratio/low_mean": 0.0015869942371864454, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036424181307666004, "epoch": 0.09827561883471335, "grad_norm": 0.08533582836389542, "kl": 0.100555419921875, "learning_rate": 1e-06, "loss": 0.0642, "step": 566 }, { "clip_ratio/high_max": 0.009477095594775164, "clip_ratio/high_mean": 0.002303948711414705, "clip_ratio/low_mean": 0.0019423225830905722, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004246271259034984, "epoch": 0.09844925067011036, "grad_norm": 0.0848531574010849, "kl": 0.0963897705078125, "learning_rate": 1e-06, "loss": 0.0639, "step": 567 }, { "clip_ratio/high_max": 0.009404039625223959, "clip_ratio/high_mean": 0.002368519049923634, "clip_ratio/low_mean": 0.0023141248157116934, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004682643862906843, "epoch": 0.09862288250550738, "grad_norm": 0.08029569685459137, "kl": 0.097137451171875, "learning_rate": 1e-06, "loss": 0.0637, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3638392857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 1788.779052734375, "completions/mean_terminated_length": 1054.86669921875, "completions/min_length": 319.0, "completions/min_terminated_length": 319.0, "epoch": 0.0987965143409044, "grad_norm": 0.12710140645503998, "kl": 0.0978240966796875, "learning_rate": 1e-06, "loss": 0.0395, "num_tokens": 63398010.0, "reward": 0.3638392984867096, "reward_std": 0.2585868239402771, "rewards/accuracy_reward/mean": 0.3638392984867096, "rewards/accuracy_reward/std": 0.4816409945487976, "step": 569 }, { "clip_ratio/high_max": 0.006392422947101295, "clip_ratio/high_mean": 0.0017868810682557523, "clip_ratio/low_mean": 0.0008311235480960022, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002618004655232653, "epoch": 0.09897014617630143, "grad_norm": 0.1233564242720604, "kl": 0.09360504150390625, "learning_rate": 1e-06, "loss": 0.0394, "step": 570 }, { "clip_ratio/high_max": 0.007062843273160979, "clip_ratio/high_mean": 0.0018965445838148298, "clip_ratio/low_mean": 0.0010180270321598073, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029145716507628094, "epoch": 0.09914377801169845, "grad_norm": 0.10988181829452515, "kl": 0.097747802734375, "learning_rate": 1e-06, "loss": 0.0393, "step": 571 }, { "clip_ratio/high_max": 0.007143179482227424, "clip_ratio/high_mean": 0.0019113794151053298, "clip_ratio/low_mean": 0.00129123273507048, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032026121734816115, "epoch": 0.09931740984709546, "grad_norm": 0.1073874682188034, "kl": 0.1029205322265625, "learning_rate": 1e-06, "loss": 0.039, "step": 572 }, { "clip_ratio/high_max": 0.008461173856630921, "clip_ratio/high_mean": 0.0021601656608254416, "clip_ratio/low_mean": 0.0017629062147079821, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003923071904864628, "epoch": 0.09949104168249248, "grad_norm": 0.10695754736661911, "kl": 0.10521697998046875, "learning_rate": 1e-06, "loss": 0.0388, "step": 573 }, { "clip_ratio/high_max": 0.010303223709343001, "clip_ratio/high_mean": 0.0025707580953167053, "clip_ratio/low_mean": 0.0020094634505767317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004580221477226587, "epoch": 0.0996646735178895, "grad_norm": 0.09764425456523895, "kl": 0.10555267333984375, "learning_rate": 1e-06, "loss": 0.0384, "step": 574 }, { "clip_ratio/high_max": 0.012060513254255056, "clip_ratio/high_mean": 0.003017170407474623, "clip_ratio/low_mean": 0.0021526303698919946, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005169800730072893, "epoch": 0.09983830535328653, "grad_norm": 0.08865787833929062, "kl": 0.1022491455078125, "learning_rate": 1e-06, "loss": 0.0381, "step": 575 }, { "clip_ratio/high_max": 0.014332153477880638, "clip_ratio/high_mean": 0.0035744860470003914, "clip_ratio/low_mean": 0.0023643549266125774, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059388410081737675, "epoch": 0.10001193718868355, "grad_norm": 0.08295557647943497, "kl": 0.09789276123046875, "learning_rate": 1e-06, "loss": 0.0377, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3058035714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3018.0, "completions/mean_length": 1706.404052734375, "completions/mean_terminated_length": 1104.8392333984375, "completions/min_length": 307.0, "completions/min_terminated_length": 307.0, "epoch": 0.10018556902408056, "grad_norm": 0.19500495493412018, "kl": 0.2429046630859375, "learning_rate": 1e-06, "loss": 0.0353, "num_tokens": 64224255.0, "reward": 0.3839285969734192, "reward_std": 0.23379170894622803, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "step": 577 }, { "clip_ratio/high_max": 0.006286250642006053, "clip_ratio/high_mean": 0.001494275507866405, "clip_ratio/low_mean": 0.0007400343183689984, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00223430980622652, "epoch": 0.10035920085947758, "grad_norm": 0.1114037036895752, "kl": 0.13885498046875, "learning_rate": 1e-06, "loss": 0.0353, "step": 578 }, { "clip_ratio/high_max": 0.007750870019663125, "clip_ratio/high_mean": 0.001870142865300295, "clip_ratio/low_mean": 0.0007488678459139919, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026190107050751976, "epoch": 0.1005328326948746, "grad_norm": 0.36090731620788574, "kl": 0.1166839599609375, "learning_rate": 1e-06, "loss": 0.0352, "step": 579 }, { "clip_ratio/high_max": 0.007673927255382296, "clip_ratio/high_mean": 0.0019447358972684015, "clip_ratio/low_mean": 0.0008571545658924151, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002801890468617785, "epoch": 0.10070646453027163, "grad_norm": 27.046974182128906, "kl": 7.421531677246094, "learning_rate": 1e-06, "loss": 0.0423, "step": 580 }, { "clip_ratio/high_max": 0.009609612352505792, "clip_ratio/high_mean": 0.002304474945049151, "clip_ratio/low_mean": 0.0009756106787790486, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032800856470203144, "epoch": 0.10088009636566865, "grad_norm": 2.726865768432617, "kl": 0.09847259521484375, "learning_rate": 1e-06, "loss": 0.0382, "step": 581 }, { "clip_ratio/high_max": 0.009724012430524454, "clip_ratio/high_mean": 0.0022768962699046824, "clip_ratio/low_mean": 0.001130573732780249, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034074700579367345, "epoch": 0.10105372820106566, "grad_norm": 0.09939669072628021, "kl": 0.10345458984375, "learning_rate": 1e-06, "loss": 0.0346, "step": 582 }, { "clip_ratio/high_max": 0.010378147650044411, "clip_ratio/high_mean": 0.002455281186485081, "clip_ratio/low_mean": 0.0013420643363133422, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003797345539169328, "epoch": 0.10122736003646268, "grad_norm": 0.6387702822685242, "kl": 0.32640838623046875, "learning_rate": 1e-06, "loss": 0.0346, "step": 583 }, { "clip_ratio/high_max": 0.010876988097152207, "clip_ratio/high_mean": 0.0025168874963128474, "clip_ratio/low_mean": 0.0015040707771731832, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004020958276669262, "epoch": 0.1014009918718597, "grad_norm": 0.0805836170911789, "kl": 0.1106719970703125, "learning_rate": 1e-06, "loss": 0.0342, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2388392857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3009.0, "completions/mean_length": 1519.477783203125, "completions/mean_terminated_length": 1032.3226318359375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.10157462370725673, "grad_norm": 0.11271578073501587, "kl": 0.1173858642578125, "learning_rate": 1e-06, "loss": 0.0408, "num_tokens": 64963637.0, "reward": 0.3883928656578064, "reward_std": 0.26550647616386414, "rewards/accuracy_reward/mean": 0.3883928656578064, "rewards/accuracy_reward/std": 0.4879295527935028, "step": 585 }, { "clip_ratio/high_max": 0.005394555311795557, "clip_ratio/high_mean": 0.0013139435513949138, "clip_ratio/low_mean": 0.0011565715703909518, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002470515105414961, "epoch": 0.10174825554265375, "grad_norm": 0.10426856577396393, "kl": 0.10958099365234375, "learning_rate": 1e-06, "loss": 0.0408, "step": 586 }, { "clip_ratio/high_max": 0.005028634666814469, "clip_ratio/high_mean": 0.0012524570236109867, "clip_ratio/low_mean": 0.0013131094583513914, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025655665194790345, "epoch": 0.10192188737805076, "grad_norm": 0.10317710787057877, "kl": 0.1150054931640625, "learning_rate": 1e-06, "loss": 0.0406, "step": 587 }, { "clip_ratio/high_max": 0.006025350779964356, "clip_ratio/high_mean": 0.0015027386043584556, "clip_ratio/low_mean": 0.001595267686752777, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030980063093011267, "epoch": 0.10209551921344778, "grad_norm": 0.09875162690877914, "kl": 0.11219024658203125, "learning_rate": 1e-06, "loss": 0.0404, "step": 588 }, { "clip_ratio/high_max": 0.0075166957030887716, "clip_ratio/high_mean": 0.0017996519322878157, "clip_ratio/low_mean": 0.0017799431407183874, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003579595066184993, "epoch": 0.1022691510488448, "grad_norm": 0.09448064118623734, "kl": 0.109161376953125, "learning_rate": 1e-06, "loss": 0.0402, "step": 589 }, { "clip_ratio/high_max": 0.007883637659688247, "clip_ratio/high_mean": 0.0019720190798580006, "clip_ratio/low_mean": 0.0021548836321017006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004126902738789795, "epoch": 0.10244278288424183, "grad_norm": 0.09079478681087494, "kl": 0.1074371337890625, "learning_rate": 1e-06, "loss": 0.0399, "step": 590 }, { "clip_ratio/high_max": 0.009548996880766936, "clip_ratio/high_mean": 0.002270152188884822, "clip_ratio/low_mean": 0.002616854782900191, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004887007104116492, "epoch": 0.10261641471963885, "grad_norm": 0.08643288910388947, "kl": 0.10916900634765625, "learning_rate": 1e-06, "loss": 0.0396, "step": 591 }, { "clip_ratio/high_max": 0.01028192684680107, "clip_ratio/high_mean": 0.0024950864794845984, "clip_ratio/low_mean": 0.0030912899383110926, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005586376330029452, "epoch": 0.10279004655503586, "grad_norm": 0.08412929624319077, "kl": 0.10770416259765625, "learning_rate": 1e-06, "loss": 0.0393, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3029.0, "completions/mean_length": 1397.5067138671875, "completions/mean_terminated_length": 970.6751098632812, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.10296367839043288, "grad_norm": 0.17691712081432343, "kl": 0.2560577392578125, "learning_rate": 1e-06, "loss": 0.038, "num_tokens": 65647432.0, "reward": 0.4799107313156128, "reward_std": 0.19456438720226288, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "step": 593 }, { "clip_ratio/high_max": 0.005694009323633509, "clip_ratio/high_mean": 0.0012101630068173108, "clip_ratio/low_mean": 0.0006558781694820937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001866041187895462, "epoch": 0.1031373102258299, "grad_norm": 0.10695333778858185, "kl": 0.15842437744140625, "learning_rate": 1e-06, "loss": 0.038, "step": 594 }, { "clip_ratio/high_max": 0.005729223863454536, "clip_ratio/high_mean": 0.0012476579477151972, "clip_ratio/low_mean": 0.0006936796066838724, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019413375521253329, "epoch": 0.10331094206122693, "grad_norm": 0.10245981812477112, "kl": 0.139617919921875, "learning_rate": 1e-06, "loss": 0.0378, "step": 595 }, { "clip_ratio/high_max": 0.007188443985796766, "clip_ratio/high_mean": 0.001495303678439086, "clip_ratio/low_mean": 0.000926336292650376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024216399833676405, "epoch": 0.10348457389662395, "grad_norm": 0.12869463860988617, "kl": 0.1253204345703125, "learning_rate": 1e-06, "loss": 0.0376, "step": 596 }, { "clip_ratio/high_max": 0.008010289988305885, "clip_ratio/high_mean": 0.0016901310464163544, "clip_ratio/low_mean": 0.0010547773672442418, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027449083645478822, "epoch": 0.10365820573202096, "grad_norm": 0.10314637422561646, "kl": 0.12419891357421875, "learning_rate": 1e-06, "loss": 0.0374, "step": 597 }, { "clip_ratio/high_max": 0.008524633129127324, "clip_ratio/high_mean": 0.001840933492530894, "clip_ratio/low_mean": 0.0015966253240549122, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003437558811128838, "epoch": 0.10383183756741798, "grad_norm": 0.08644825220108032, "kl": 0.130523681640625, "learning_rate": 1e-06, "loss": 0.0372, "step": 598 }, { "clip_ratio/high_max": 0.008395935514272423, "clip_ratio/high_mean": 0.0017839446468315145, "clip_ratio/low_mean": 0.0021065104265289847, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003890455100190593, "epoch": 0.104005469402815, "grad_norm": 0.11988653242588043, "kl": 0.166290283203125, "learning_rate": 1e-06, "loss": 0.037, "step": 599 }, { "clip_ratio/high_max": 0.010687976529879961, "clip_ratio/high_mean": 0.0021609123114103568, "clip_ratio/low_mean": 0.002254300816275645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0044152131376904435, "epoch": 0.10417910123821203, "grad_norm": 0.08074014633893967, "kl": 0.136566162109375, "learning_rate": 1e-06, "loss": 0.0368, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1651785714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3044.0, "completions/mean_length": 1336.984375, "completions/mean_terminated_length": 993.6925048828125, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.10435273307360905, "grad_norm": 0.13911767303943634, "kl": 0.14630126953125, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 66305793.0, "reward": 0.4263392984867096, "reward_std": 0.2752034664154053, "rewards/accuracy_reward/mean": 0.4263392984867096, "rewards/accuracy_reward/std": 0.49509719014167786, "step": 601 }, { "clip_ratio/high_max": 0.0055149031250039116, "clip_ratio/high_mean": 0.0014354384961734468, "clip_ratio/low_mean": 0.00116749943481409, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00260293788232957, "epoch": 0.10452636490900606, "grad_norm": 0.1171986535191536, "kl": 0.123138427734375, "learning_rate": 1e-06, "loss": 0.0329, "step": 602 }, { "clip_ratio/high_max": 0.005914551958994707, "clip_ratio/high_mean": 0.0015293264009414997, "clip_ratio/low_mean": 0.0014771027490496635, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030064291531743947, "epoch": 0.10469999674440308, "grad_norm": 0.11808642745018005, "kl": 0.1215972900390625, "learning_rate": 1e-06, "loss": 0.0327, "step": 603 }, { "clip_ratio/high_max": 0.007240568207635079, "clip_ratio/high_mean": 0.0018537420010034111, "clip_ratio/low_mean": 0.001546918128951802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034006601035798667, "epoch": 0.10487362857980011, "grad_norm": 0.11788731068372726, "kl": 0.1138153076171875, "learning_rate": 1e-06, "loss": 0.0324, "step": 604 }, { "clip_ratio/high_max": 0.00810379660106264, "clip_ratio/high_mean": 0.0020948511528331437, "clip_ratio/low_mean": 0.0017623507919779513, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003857201942082611, "epoch": 0.10504726041519713, "grad_norm": 0.11519483476877213, "kl": 0.111968994140625, "learning_rate": 1e-06, "loss": 0.0321, "step": 605 }, { "clip_ratio/high_max": 0.009337175255495822, "clip_ratio/high_mean": 0.002409870093288191, "clip_ratio/low_mean": 0.0023295229893847136, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004739393070849474, "epoch": 0.10522089225059415, "grad_norm": 0.09988521039485931, "kl": 0.115386962890625, "learning_rate": 1e-06, "loss": 0.0318, "step": 606 }, { "clip_ratio/high_max": 0.009714379586512223, "clip_ratio/high_mean": 0.0024727845875531784, "clip_ratio/low_mean": 0.0029646154853253393, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005437399973743595, "epoch": 0.10539452408599116, "grad_norm": 0.09596744924783707, "kl": 0.121002197265625, "learning_rate": 1e-06, "loss": 0.0315, "step": 607 }, { "clip_ratio/high_max": 0.011864491872984217, "clip_ratio/high_mean": 0.002960638159493101, "clip_ratio/low_mean": 0.0035092678408545908, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006469906074926257, "epoch": 0.10556815592138818, "grad_norm": 0.1032036691904068, "kl": 0.1242828369140625, "learning_rate": 1e-06, "loss": 0.0312, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2950.0, "completions/mean_length": 1543.212158203125, "completions/mean_terminated_length": 990.2462158203125, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.10574178775678521, "grad_norm": 0.10885374993085861, "kl": 0.12007904052734375, "learning_rate": 1e-06, "loss": 0.0478, "num_tokens": 67057456.0, "reward": 0.3839285969734192, "reward_std": 0.1945657879114151, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "step": 609 }, { "clip_ratio/high_max": 0.0054692756948497845, "clip_ratio/high_mean": 0.0012177309135950054, "clip_ratio/low_mean": 0.0007838897959118185, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002001620671762794, "epoch": 0.10591541959218223, "grad_norm": 0.10313685238361359, "kl": 0.1143798828125, "learning_rate": 1e-06, "loss": 0.0478, "step": 610 }, { "clip_ratio/high_max": 0.005088049692858476, "clip_ratio/high_mean": 0.0011314754724480736, "clip_ratio/low_mean": 0.0009782148299564142, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021096903201396344, "epoch": 0.10608905142757925, "grad_norm": 0.09435322880744934, "kl": 0.12317657470703125, "learning_rate": 1e-06, "loss": 0.0476, "step": 611 }, { "clip_ratio/high_max": 0.005201743919315049, "clip_ratio/high_mean": 0.0012035260301672679, "clip_ratio/low_mean": 0.0012996776877116645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025032037501659943, "epoch": 0.10626268326297626, "grad_norm": 0.0925537571310997, "kl": 0.133056640625, "learning_rate": 1e-06, "loss": 0.0475, "step": 612 }, { "clip_ratio/high_max": 0.006548590201418847, "clip_ratio/high_mean": 0.0015049730668579286, "clip_ratio/low_mean": 0.001528971481093322, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003033944562957913, "epoch": 0.10643631509837329, "grad_norm": 0.09320832043886185, "kl": 0.12696075439453125, "learning_rate": 1e-06, "loss": 0.0473, "step": 613 }, { "clip_ratio/high_max": 0.007854046463762643, "clip_ratio/high_mean": 0.0018320681883778889, "clip_ratio/low_mean": 0.0017995815105678048, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003631649668022874, "epoch": 0.10660994693377031, "grad_norm": 0.0831611230969429, "kl": 0.120147705078125, "learning_rate": 1e-06, "loss": 0.047, "step": 614 }, { "clip_ratio/high_max": 0.008646402748127002, "clip_ratio/high_mean": 0.0019904277019122674, "clip_ratio/low_mean": 0.002092553202601266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004082980840394157, "epoch": 0.10678357876916733, "grad_norm": 0.08673538267612457, "kl": 0.1242218017578125, "learning_rate": 1e-06, "loss": 0.0468, "step": 615 }, { "clip_ratio/high_max": 0.010695587559894193, "clip_ratio/high_mean": 0.002425488423796196, "clip_ratio/low_mean": 0.0023235598291648785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004749048292069347, "epoch": 0.10695721060456434, "grad_norm": 0.0740213394165039, "kl": 0.11742401123046875, "learning_rate": 1e-06, "loss": 0.0465, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1585.63623046875, "completions/mean_terminated_length": 1010.4179077148438, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "epoch": 0.10713084243996136, "grad_norm": 0.11364158987998962, "kl": 0.143096923828125, "learning_rate": 1e-06, "loss": 0.0289, "num_tokens": 67831293.0, "reward": 0.3348214328289032, "reward_std": 0.21515221893787384, "rewards/accuracy_reward/mean": 0.3348214328289032, "rewards/accuracy_reward/std": 0.47245556116104126, "step": 617 }, { "clip_ratio/high_max": 0.004954819902195595, "clip_ratio/high_mean": 0.001161668783879577, "clip_ratio/low_mean": 0.000782760668357696, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019444294375716709, "epoch": 0.10730447427535839, "grad_norm": 0.1085372194647789, "kl": 0.13299560546875, "learning_rate": 1e-06, "loss": 0.0289, "step": 618 }, { "clip_ratio/high_max": 0.005875223014299991, "clip_ratio/high_mean": 0.0013006447156840295, "clip_ratio/low_mean": 0.0008700122937170818, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002170656973248697, "epoch": 0.10747810611075541, "grad_norm": 0.10767918825149536, "kl": 0.12372589111328125, "learning_rate": 1e-06, "loss": 0.0287, "step": 619 }, { "clip_ratio/high_max": 0.006734420976499678, "clip_ratio/high_mean": 0.0014982617044552171, "clip_ratio/low_mean": 0.0011177541266533808, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002616015799503657, "epoch": 0.10765173794615243, "grad_norm": 0.10217170417308807, "kl": 0.129852294921875, "learning_rate": 1e-06, "loss": 0.0285, "step": 620 }, { "clip_ratio/high_max": 0.007117597890101024, "clip_ratio/high_mean": 0.0016002274546735862, "clip_ratio/low_mean": 0.0012737485890284006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028739761110045947, "epoch": 0.10782536978154944, "grad_norm": 0.09558083862066269, "kl": 0.128082275390625, "learning_rate": 1e-06, "loss": 0.0283, "step": 621 }, { "clip_ratio/high_max": 0.008474279997244594, "clip_ratio/high_mean": 0.0018609785397529777, "clip_ratio/low_mean": 0.0017042992271854018, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035652777787618106, "epoch": 0.10799900161694646, "grad_norm": 0.08518858253955841, "kl": 0.13482666015625, "learning_rate": 1e-06, "loss": 0.028, "step": 622 }, { "clip_ratio/high_max": 0.008925316536988248, "clip_ratio/high_mean": 0.0019800586374003615, "clip_ratio/low_mean": 0.0021166963729228883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004096755001228303, "epoch": 0.10817263345234349, "grad_norm": 0.08757380396127701, "kl": 0.1350860595703125, "learning_rate": 1e-06, "loss": 0.0278, "step": 623 }, { "clip_ratio/high_max": 0.010995861204719404, "clip_ratio/high_mean": 0.0024121919163917482, "clip_ratio/low_mean": 0.0024892558349165483, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004901447748125065, "epoch": 0.10834626528774051, "grad_norm": 0.10320673137903214, "kl": 0.1327972412109375, "learning_rate": 1e-06, "loss": 0.0276, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2834821428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2917.0, "completions/mean_length": 1623.6920166015625, "completions/mean_terminated_length": 1050.685302734375, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.10851989712313753, "grad_norm": 0.09791652113199234, "kl": 0.1237030029296875, "learning_rate": 1e-06, "loss": 0.0253, "num_tokens": 68626411.0, "reward": 0.3727678656578064, "reward_std": 0.20215803384780884, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "step": 625 }, { "clip_ratio/high_max": 0.004580806033118279, "clip_ratio/high_mean": 0.001037203879604931, "clip_ratio/low_mean": 0.0006873573547636624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017245612243641517, "epoch": 0.10869352895853454, "grad_norm": 0.09922773391008377, "kl": 0.1281280517578125, "learning_rate": 1e-06, "loss": 0.0254, "step": 626 }, { "clip_ratio/high_max": 0.004210153889289359, "clip_ratio/high_mean": 0.0010292773938544997, "clip_ratio/low_mean": 0.0009714083932976791, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002000685809434799, "epoch": 0.10886716079393156, "grad_norm": 0.0895802453160286, "kl": 0.1264495849609375, "learning_rate": 1e-06, "loss": 0.0252, "step": 627 }, { "clip_ratio/high_max": 0.0047850209011812694, "clip_ratio/high_mean": 0.001175729528085867, "clip_ratio/low_mean": 0.0008551188204819482, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020308483449298365, "epoch": 0.10904079262932859, "grad_norm": 0.2808818221092224, "kl": 0.1244964599609375, "learning_rate": 1e-06, "loss": 0.0251, "step": 628 }, { "clip_ratio/high_max": 0.005413979732111329, "clip_ratio/high_mean": 0.0013241871797617932, "clip_ratio/low_mean": 0.0009983442080283567, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023225314098453964, "epoch": 0.10921442446472561, "grad_norm": 0.12357468158006668, "kl": 0.1405181884765625, "learning_rate": 1e-06, "loss": 0.0249, "step": 629 }, { "clip_ratio/high_max": 0.006302729741946678, "clip_ratio/high_mean": 0.0015348849576639623, "clip_ratio/low_mean": 0.0012404986764522619, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00277538360478502, "epoch": 0.10938805630012263, "grad_norm": 1.025601863861084, "kl": 0.4533233642578125, "learning_rate": 1e-06, "loss": 0.025, "step": 630 }, { "clip_ratio/high_max": 0.007136957014154177, "clip_ratio/high_mean": 0.0017864025053313526, "clip_ratio/low_mean": 0.0013864639174698823, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031728663670946844, "epoch": 0.10956168813551964, "grad_norm": 5.083588600158691, "kl": 0.113372802734375, "learning_rate": 1e-06, "loss": 0.0307, "step": 631 }, { "clip_ratio/high_max": 0.00861251260357676, "clip_ratio/high_mean": 0.0020784286798516405, "clip_ratio/low_mean": 0.001626922595733049, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037053513324281084, "epoch": 0.10973531997091666, "grad_norm": 1.3815165758132935, "kl": 0.113677978515625, "learning_rate": 1e-06, "loss": 0.0258, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 3072.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 1646.857177734375, "completions/mean_terminated_length": 1045.130126953125, "completions/min_length": 179.0, "completions/min_terminated_length": 179.0, "epoch": 0.10990895180631369, "grad_norm": 0.15000373125076294, "kl": 0.1589202880859375, "learning_rate": 1e-06, "loss": 0.0237, "num_tokens": 69424403.0, "reward": 0.3794642984867096, "reward_std": 0.24025489389896393, "rewards/accuracy_reward/mean": 0.3794642984867096, "rewards/accuracy_reward/std": 0.485796183347702, "step": 633 }, { "clip_ratio/high_max": 0.0055441889162466396, "clip_ratio/high_mean": 0.0014373042524766788, "clip_ratio/low_mean": 0.0009175344907816907, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023548387480332167, "epoch": 0.11008258364171071, "grad_norm": 0.11633960902690887, "kl": 0.1384735107421875, "learning_rate": 1e-06, "loss": 0.0238, "step": 634 }, { "clip_ratio/high_max": 0.006624595949688228, "clip_ratio/high_mean": 0.0017297121714818786, "clip_ratio/low_mean": 0.001150295340721641, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028800074987884727, "epoch": 0.11025621547710773, "grad_norm": 0.12486882507801056, "kl": 0.1225128173828125, "learning_rate": 1e-06, "loss": 0.0236, "step": 635 }, { "clip_ratio/high_max": 0.007715124316746369, "clip_ratio/high_mean": 0.001958971644853591, "clip_ratio/low_mean": 0.0015582855112370453, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035172571524526575, "epoch": 0.11042984731250474, "grad_norm": 0.12910649180412292, "kl": 0.1269683837890625, "learning_rate": 1e-06, "loss": 0.0234, "step": 636 }, { "clip_ratio/high_max": 0.008155132440151647, "clip_ratio/high_mean": 0.002080389467664645, "clip_ratio/low_mean": 0.0018033571022897377, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038837465253891423, "epoch": 0.11060347914790176, "grad_norm": 0.11794328689575195, "kl": 0.1288909912109375, "learning_rate": 1e-06, "loss": 0.0231, "step": 637 }, { "clip_ratio/high_max": 0.008817670786811505, "clip_ratio/high_mean": 0.0022277203765952436, "clip_ratio/low_mean": 0.002179201845137868, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004406922261296131, "epoch": 0.11077711098329879, "grad_norm": 0.09746332466602325, "kl": 0.1331329345703125, "learning_rate": 1e-06, "loss": 0.0228, "step": 638 }, { "clip_ratio/high_max": 0.010396712434157962, "clip_ratio/high_mean": 0.0024670998282090295, "clip_ratio/low_mean": 0.002564172806160059, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00503127262891212, "epoch": 0.11095074281869581, "grad_norm": 0.09589476138353348, "kl": 0.1372222900390625, "learning_rate": 1e-06, "loss": 0.0225, "step": 639 }, { "clip_ratio/high_max": 0.011701288836775348, "clip_ratio/high_mean": 0.0027457954738565604, "clip_ratio/low_mean": 0.0028469257658798597, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005592721194261685, "epoch": 0.11112437465409283, "grad_norm": 0.10442132502794266, "kl": 0.1424102783203125, "learning_rate": 1e-06, "loss": 0.0222, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3080357142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 3071.0, "completions/mean_length": 1624.5848388671875, "completions/mean_terminated_length": 980.2515869140625, "completions/min_length": 263.0, "completions/min_terminated_length": 263.0, "epoch": 0.11129800648948984, "grad_norm": 1.8149919509887695, "kl": 0.8163604736328125, "learning_rate": 1e-06, "loss": 0.0393, "num_tokens": 70214433.0, "reward": 0.3660714328289032, "reward_std": 0.17952683568000793, "rewards/accuracy_reward/mean": 0.3660714328289032, "rewards/accuracy_reward/std": 0.482267826795578, "step": 641 }, { "clip_ratio/high_max": 0.005008247626392404, "clip_ratio/high_mean": 0.0010314847327208554, "clip_ratio/low_mean": 0.0006924079694954344, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001723892715290276, "epoch": 0.11147163832488687, "grad_norm": 0.18737336993217468, "kl": 0.1542816162109375, "learning_rate": 1e-06, "loss": 0.0387, "step": 642 }, { "clip_ratio/high_max": 0.006144755028799409, "clip_ratio/high_mean": 0.001228337164320692, "clip_ratio/low_mean": 0.0008047753171922523, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020331124933363753, "epoch": 0.11164527016028389, "grad_norm": 8.555008888244629, "kl": 0.12164306640625, "learning_rate": 1e-06, "loss": 0.0403, "step": 643 }, { "clip_ratio/high_max": 0.0074822317947109696, "clip_ratio/high_mean": 0.0014768370142519416, "clip_ratio/low_mean": 0.0007906369653483125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022674739611829864, "epoch": 0.11181890199568091, "grad_norm": 3.671978712081909, "kl": 0.125335693359375, "learning_rate": 1e-06, "loss": 0.0393, "step": 644 }, { "clip_ratio/high_max": 0.007066616504744161, "clip_ratio/high_mean": 0.0013716237876906234, "clip_ratio/low_mean": 0.000928546912746242, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023001707004368654, "epoch": 0.11199253383107793, "grad_norm": 0.23054051399230957, "kl": 0.136260986328125, "learning_rate": 1e-06, "loss": 0.0385, "step": 645 }, { "clip_ratio/high_max": 0.0066148156147392, "clip_ratio/high_mean": 0.0012946421588821977, "clip_ratio/low_mean": 0.0009615306166779192, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022561727400898235, "epoch": 0.11216616566647494, "grad_norm": 0.21971940994262695, "kl": 0.2414093017578125, "learning_rate": 1e-06, "loss": 0.0384, "step": 646 }, { "clip_ratio/high_max": 0.006858716020360589, "clip_ratio/high_mean": 0.0013446965076582273, "clip_ratio/low_mean": 0.0012647373073377821, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026094337772519793, "epoch": 0.11233979750187197, "grad_norm": 0.6919918656349182, "kl": 0.5586700439453125, "learning_rate": 1e-06, "loss": 0.0386, "step": 647 }, { "clip_ratio/high_max": 0.008649084946227958, "clip_ratio/high_mean": 0.0016719942618692585, "clip_ratio/low_mean": 0.0014033928246135474, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030753870796615956, "epoch": 0.11251342933726899, "grad_norm": 0.09924893826246262, "kl": 0.175811767578125, "learning_rate": 1e-06, "loss": 0.038, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 2957.0, "completions/mean_length": 1554.8035888671875, "completions/mean_terminated_length": 1055.07421875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.11268706117266601, "grad_norm": 0.13468952476978302, "kl": 0.21826171875, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 70982481.0, "reward": 0.4799107313156128, "reward_std": 0.27474841475486755, "rewards/accuracy_reward/mean": 0.4799107015132904, "rewards/accuracy_reward/std": 0.5001547932624817, "step": 649 }, { "clip_ratio/high_max": 0.006694545449136058, "clip_ratio/high_mean": 0.0017772323394638079, "clip_ratio/low_mean": 0.000877131742072379, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002654364063346293, "epoch": 0.11286069300806303, "grad_norm": 0.11200286448001862, "kl": 0.1877593994140625, "learning_rate": 1e-06, "loss": 0.0446, "step": 650 }, { "clip_ratio/high_max": 0.007458576750650536, "clip_ratio/high_mean": 0.0020431414873200993, "clip_ratio/low_mean": 0.0009460123428652878, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029891537997173145, "epoch": 0.11303432484346004, "grad_norm": 0.11584542691707611, "kl": 0.1470794677734375, "learning_rate": 1e-06, "loss": 0.0444, "step": 651 }, { "clip_ratio/high_max": 0.007804542030498851, "clip_ratio/high_mean": 0.0021164253166716662, "clip_ratio/low_mean": 0.0012075130052835448, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033239383046748117, "epoch": 0.11320795667885707, "grad_norm": 0.1036299541592598, "kl": 0.1316986083984375, "learning_rate": 1e-06, "loss": 0.0442, "step": 652 }, { "clip_ratio/high_max": 0.009218615923600737, "clip_ratio/high_mean": 0.0025042151510206168, "clip_ratio/low_mean": 0.001318627514592663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003822842649242375, "epoch": 0.11338158851425409, "grad_norm": 0.09888607263565063, "kl": 0.1257171630859375, "learning_rate": 1e-06, "loss": 0.044, "step": 653 }, { "clip_ratio/high_max": 0.009359388954180758, "clip_ratio/high_mean": 0.0025433412129132194, "clip_ratio/low_mean": 0.0016620860442344565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004205427205306478, "epoch": 0.11355522034965111, "grad_norm": 0.09465382248163223, "kl": 0.1264495849609375, "learning_rate": 1e-06, "loss": 0.0437, "step": 654 }, { "clip_ratio/high_max": 0.010163304643356241, "clip_ratio/high_mean": 0.0027474493535919464, "clip_ratio/low_mean": 0.0020441164783733257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004791565879713744, "epoch": 0.11372885218504813, "grad_norm": 0.09865810722112656, "kl": 0.1263427734375, "learning_rate": 1e-06, "loss": 0.0435, "step": 655 }, { "clip_ratio/high_max": 0.011848383241158444, "clip_ratio/high_mean": 0.003093421019912057, "clip_ratio/low_mean": 0.002429043150641519, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0055224641764652915, "epoch": 0.11390248402044514, "grad_norm": 0.09180879592895508, "kl": 0.1252288818359375, "learning_rate": 1e-06, "loss": 0.0432, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3058035714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 2935.0, "completions/mean_length": 1631.7366943359375, "completions/mean_terminated_length": 997.2797241210938, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.11407611585584217, "grad_norm": 0.11302841454744339, "kl": 0.12689971923828125, "learning_rate": 1e-06, "loss": 0.0373, "num_tokens": 71772483.0, "reward": 0.3526785969734192, "reward_std": 0.20673204958438873, "rewards/accuracy_reward/mean": 0.3526785671710968, "rewards/accuracy_reward/std": 0.4783378541469574, "step": 657 }, { "clip_ratio/high_max": 0.004071630050020758, "clip_ratio/high_mean": 0.0009828667107285582, "clip_ratio/low_mean": 0.0007458906945885246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00172875741009193, "epoch": 0.11424974769123919, "grad_norm": 0.10729670524597168, "kl": 0.12036895751953125, "learning_rate": 1e-06, "loss": 0.0374, "step": 658 }, { "clip_ratio/high_max": 0.004821415916012484, "clip_ratio/high_mean": 0.0011523424529968906, "clip_ratio/low_mean": 0.0007562097716800054, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019085522494606266, "epoch": 0.11442337952663621, "grad_norm": 0.10813155025243759, "kl": 0.11545562744140625, "learning_rate": 1e-06, "loss": 0.0373, "step": 659 }, { "clip_ratio/high_max": 0.004848478449275717, "clip_ratio/high_mean": 0.0011809233560597932, "clip_ratio/low_mean": 0.001028798935294617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002209722280895221, "epoch": 0.11459701136203324, "grad_norm": 0.09106927365064621, "kl": 0.126434326171875, "learning_rate": 1e-06, "loss": 0.0371, "step": 660 }, { "clip_ratio/high_max": 0.006102633482441888, "clip_ratio/high_mean": 0.001405882280323567, "clip_ratio/low_mean": 0.001174745398657251, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00258062767352385, "epoch": 0.11477064319743024, "grad_norm": 0.08081386983394623, "kl": 0.12450408935546875, "learning_rate": 1e-06, "loss": 0.0369, "step": 661 }, { "clip_ratio/high_max": 0.0067041659967799205, "clip_ratio/high_mean": 0.0015445179615198867, "clip_ratio/low_mean": 0.0015004538267930911, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003044971779672778, "epoch": 0.11494427503282727, "grad_norm": 0.07732100784778595, "kl": 0.12770843505859375, "learning_rate": 1e-06, "loss": 0.0367, "step": 662 }, { "clip_ratio/high_max": 0.006915974619914778, "clip_ratio/high_mean": 0.001667089555212442, "clip_ratio/low_mean": 0.0017013266024150653, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033684162008285057, "epoch": 0.11511790686822429, "grad_norm": 0.0785394236445427, "kl": 0.13162994384765625, "learning_rate": 1e-06, "loss": 0.0365, "step": 663 }, { "clip_ratio/high_max": 0.008694551546795992, "clip_ratio/high_mean": 0.002062492607819877, "clip_ratio/low_mean": 0.0018805137447088782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003943006404369953, "epoch": 0.11529153870362131, "grad_norm": 0.06976956874132156, "kl": 0.126678466796875, "learning_rate": 1e-06, "loss": 0.0363, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2901785714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 1594.6473388671875, "completions/mean_terminated_length": 990.6980590820312, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.11546517053901834, "grad_norm": 0.20943261682987213, "kl": 0.2438507080078125, "learning_rate": 1e-06, "loss": 0.0167, "num_tokens": 72546485.0, "reward": 0.314732164144516, "reward_std": 0.2107923924922943, "rewards/accuracy_reward/mean": 0.3147321343421936, "rewards/accuracy_reward/std": 0.4649282693862915, "step": 665 }, { "clip_ratio/high_max": 0.004340030216553714, "clip_ratio/high_mean": 0.0009781416074474691, "clip_ratio/low_mean": 0.000743171904105111, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017213135261044954, "epoch": 0.11563880237441534, "grad_norm": 0.09847511351108551, "kl": 0.15857696533203125, "learning_rate": 1e-06, "loss": 0.0167, "step": 666 }, { "clip_ratio/high_max": 0.005714030405215453, "clip_ratio/high_mean": 0.0012609584396159335, "clip_ratio/low_mean": 0.0007972235962370178, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002058182022665278, "epoch": 0.11581243420981237, "grad_norm": 0.12540245056152344, "kl": 0.121429443359375, "learning_rate": 1e-06, "loss": 0.0165, "step": 667 }, { "clip_ratio/high_max": 0.006478336581494659, "clip_ratio/high_mean": 0.0013979785576339054, "clip_ratio/low_mean": 0.0010091426129292813, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024071211873888387, "epoch": 0.11598606604520939, "grad_norm": 0.12482191622257233, "kl": 0.1199493408203125, "learning_rate": 1e-06, "loss": 0.0164, "step": 668 }, { "clip_ratio/high_max": 0.006452807010646211, "clip_ratio/high_mean": 0.0014364958365149505, "clip_ratio/low_mean": 0.0010187153716287867, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024552112445235252, "epoch": 0.11615969788060641, "grad_norm": 0.11775334179401398, "kl": 0.1210174560546875, "learning_rate": 1e-06, "loss": 0.0162, "step": 669 }, { "clip_ratio/high_max": 0.00746102891935152, "clip_ratio/high_mean": 0.0015784647334839974, "clip_ratio/low_mean": 0.0013617688846352394, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029402336385828676, "epoch": 0.11633332971600342, "grad_norm": 0.08998902142047882, "kl": 0.12697601318359375, "learning_rate": 1e-06, "loss": 0.0159, "step": 670 }, { "clip_ratio/high_max": 0.008106981666060165, "clip_ratio/high_mean": 0.00170842649822589, "clip_ratio/low_mean": 0.0017561126533109928, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034645390951482113, "epoch": 0.11650696155140045, "grad_norm": 0.10009998083114624, "kl": 0.134307861328125, "learning_rate": 1e-06, "loss": 0.0157, "step": 671 }, { "clip_ratio/high_max": 0.009128668174525956, "clip_ratio/high_mean": 0.0019058152752222668, "clip_ratio/low_mean": 0.002222849825557205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004128665132157039, "epoch": 0.11668059338679747, "grad_norm": 0.10441833734512329, "kl": 0.14472198486328125, "learning_rate": 1e-06, "loss": 0.0155, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2566964285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3066.0, "completions/mean_length": 1507.3282470703125, "completions/mean_terminated_length": 966.9760131835938, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "epoch": 0.11685422522219449, "grad_norm": 0.20622572302818298, "kl": 0.36383056640625, "learning_rate": 1e-06, "loss": 0.046, "num_tokens": 73282872.0, "reward": 0.4151785969734192, "reward_std": 0.25949332118034363, "rewards/accuracy_reward/mean": 0.4151785671710968, "rewards/accuracy_reward/std": 0.49330365657806396, "step": 673 }, { "clip_ratio/high_max": 0.0060627071288763545, "clip_ratio/high_mean": 0.0015796721800143132, "clip_ratio/low_mean": 0.0009206651002386934, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025003372866194695, "epoch": 0.11702785705759151, "grad_norm": 0.13123495876789093, "kl": 0.22841644287109375, "learning_rate": 1e-06, "loss": 0.0458, "step": 674 }, { "clip_ratio/high_max": 0.0060459511005319655, "clip_ratio/high_mean": 0.001563952174365113, "clip_ratio/low_mean": 0.0011133524185424903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026773045919981087, "epoch": 0.11720148889298852, "grad_norm": 0.11944543570280075, "kl": 0.17305755615234375, "learning_rate": 1e-06, "loss": 0.0456, "step": 675 }, { "clip_ratio/high_max": 0.007238270074594766, "clip_ratio/high_mean": 0.0018776575761876302, "clip_ratio/low_mean": 0.0013101268332320615, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003187784439433017, "epoch": 0.11737512072838555, "grad_norm": 0.10852908343076706, "kl": 0.15468597412109375, "learning_rate": 1e-06, "loss": 0.0453, "step": 676 }, { "clip_ratio/high_max": 0.008869268116541207, "clip_ratio/high_mean": 0.0021505715512830648, "clip_ratio/low_mean": 0.001730606850287586, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038811784870631527, "epoch": 0.11754875256378257, "grad_norm": 0.10218239575624466, "kl": 0.16230010986328125, "learning_rate": 1e-06, "loss": 0.045, "step": 677 }, { "clip_ratio/high_max": 0.009799962528632022, "clip_ratio/high_mean": 0.0024399198864557547, "clip_ratio/low_mean": 0.0021070182065159315, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004546938009298174, "epoch": 0.11772238439917959, "grad_norm": 0.11909385025501251, "kl": 0.16156768798828125, "learning_rate": 1e-06, "loss": 0.0447, "step": 678 }, { "clip_ratio/high_max": 0.010921474422502797, "clip_ratio/high_mean": 0.0027124995413032593, "clip_ratio/low_mean": 0.0026146303016503225, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0053271298347681295, "epoch": 0.11789601623457661, "grad_norm": 0.11023889482021332, "kl": 0.154998779296875, "learning_rate": 1e-06, "loss": 0.0444, "step": 679 }, { "clip_ratio/high_max": 0.014046735130250454, "clip_ratio/high_mean": 0.0034309635948375217, "clip_ratio/low_mean": 0.0030119201919660554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006442883921408793, "epoch": 0.11806964806997362, "grad_norm": 0.15841712057590485, "kl": 0.1901397705078125, "learning_rate": 1e-06, "loss": 0.0442, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2075892857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 1367.1004638671875, "completions/mean_terminated_length": 920.4647827148438, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.11824327990537065, "grad_norm": 0.12350305914878845, "kl": 0.1192169189453125, "learning_rate": 1e-06, "loss": 0.0128, "num_tokens": 73964229.0, "reward": 0.3816964328289032, "reward_std": 0.19456438720226288, "rewards/accuracy_reward/mean": 0.3816964328289032, "rewards/accuracy_reward/std": 0.4863457679748535, "step": 681 }, { "clip_ratio/high_max": 0.005486613659741124, "clip_ratio/high_mean": 0.0013437889379019907, "clip_ratio/low_mean": 0.0007331819833780173, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020769709244632395, "epoch": 0.11841691174076767, "grad_norm": 0.1259164810180664, "kl": 0.1223907470703125, "learning_rate": 1e-06, "loss": 0.0129, "step": 682 }, { "clip_ratio/high_max": 0.005517669545952231, "clip_ratio/high_mean": 0.001383084224016784, "clip_ratio/low_mean": 0.0007241846897159121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002107268943291274, "epoch": 0.11859054357616469, "grad_norm": 0.13277296721935272, "kl": 0.1185302734375, "learning_rate": 1e-06, "loss": 0.0127, "step": 683 }, { "clip_ratio/high_max": 0.006291908444836736, "clip_ratio/high_mean": 0.0015288794047592091, "clip_ratio/low_mean": 0.0010521283829802996, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002581007756816689, "epoch": 0.11876417541156171, "grad_norm": 0.1070866584777832, "kl": 0.1232757568359375, "learning_rate": 1e-06, "loss": 0.0124, "step": 684 }, { "clip_ratio/high_max": 0.007158263724704739, "clip_ratio/high_mean": 0.00173023351271695, "clip_ratio/low_mean": 0.001349032693724439, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003079266272834502, "epoch": 0.11893780724695872, "grad_norm": 0.0956646054983139, "kl": 0.124542236328125, "learning_rate": 1e-06, "loss": 0.0121, "step": 685 }, { "clip_ratio/high_max": 0.008147985648975009, "clip_ratio/high_mean": 0.001974984907747057, "clip_ratio/low_mean": 0.0019071659417022602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003882150831486797, "epoch": 0.11911143908235575, "grad_norm": 0.10041145980358124, "kl": 0.1337432861328125, "learning_rate": 1e-06, "loss": 0.0119, "step": 686 }, { "clip_ratio/high_max": 0.009665767356636934, "clip_ratio/high_mean": 0.0024560353558626957, "clip_ratio/low_mean": 0.002199787408244447, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004655822824133793, "epoch": 0.11928507091775277, "grad_norm": 0.0962672159075737, "kl": 0.12994384765625, "learning_rate": 1e-06, "loss": 0.0116, "step": 687 }, { "clip_ratio/high_max": 0.010742996808403404, "clip_ratio/high_mean": 0.002683476160200371, "clip_ratio/low_mean": 0.0025180444381476264, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052015206711075734, "epoch": 0.11945870275314979, "grad_norm": 0.08573590219020844, "kl": 0.1287078857421875, "learning_rate": 1e-06, "loss": 0.0114, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 1491.4219970703125, "completions/mean_terminated_length": 951.9431762695312, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.11963233458854682, "grad_norm": 0.11421704292297363, "kl": 0.13143157958984375, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 74691754.0, "reward": 0.3750000298023224, "reward_std": 0.18013805150985718, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "step": 689 }, { "clip_ratio/high_max": 0.0046173269183782395, "clip_ratio/high_mean": 0.0009881396949822374, "clip_ratio/low_mean": 0.0007808649511389376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017690046388452174, "epoch": 0.11980596642394382, "grad_norm": 0.10634800791740417, "kl": 0.1232452392578125, "learning_rate": 1e-06, "loss": 0.029, "step": 690 }, { "clip_ratio/high_max": 0.005140743120136904, "clip_ratio/high_mean": 0.0011118346064904472, "clip_ratio/low_mean": 0.0008264673849680548, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019383019980523386, "epoch": 0.11997959825934085, "grad_norm": 0.10419665277004242, "kl": 0.1226348876953125, "learning_rate": 1e-06, "loss": 0.029, "step": 691 }, { "clip_ratio/high_max": 0.005565622421272565, "clip_ratio/high_mean": 0.0012242660445735964, "clip_ratio/low_mean": 0.0010160587651171227, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002240324807644356, "epoch": 0.12015323009473787, "grad_norm": 0.10020323097705841, "kl": 0.121185302734375, "learning_rate": 1e-06, "loss": 0.0288, "step": 692 }, { "clip_ratio/high_max": 0.0066935581344296224, "clip_ratio/high_mean": 0.0014018436640981236, "clip_ratio/low_mean": 0.0013742301539423352, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027760738139477326, "epoch": 0.12032686193013489, "grad_norm": 0.08467567712068558, "kl": 0.12558746337890625, "learning_rate": 1e-06, "loss": 0.0285, "step": 693 }, { "clip_ratio/high_max": 0.0069776353557244875, "clip_ratio/high_mean": 0.0014614452638852526, "clip_ratio/low_mean": 0.00173080990816743, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031922551806928823, "epoch": 0.12050049376553192, "grad_norm": 0.0780162364244461, "kl": 0.13153839111328125, "learning_rate": 1e-06, "loss": 0.0283, "step": 694 }, { "clip_ratio/high_max": 0.008299885070300661, "clip_ratio/high_mean": 0.0017410374030077946, "clip_ratio/low_mean": 0.0021202442276262445, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003861281596982735, "epoch": 0.12067412560092892, "grad_norm": 0.07754679024219513, "kl": 0.13550567626953125, "learning_rate": 1e-06, "loss": 0.0281, "step": 695 }, { "clip_ratio/high_max": 0.00941979654453462, "clip_ratio/high_mean": 0.0020381682425067993, "clip_ratio/low_mean": 0.0025119276224359055, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004550095971353585, "epoch": 0.12084775743632595, "grad_norm": 0.07709571719169617, "kl": 0.13329315185546875, "learning_rate": 1e-06, "loss": 0.0279, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2834821428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 1550.2857666015625, "completions/mean_terminated_length": 948.2367553710938, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.12102138927172297, "grad_norm": 0.38890787959098816, "kl": 0.218597412109375, "learning_rate": 1e-06, "loss": 0.031, "num_tokens": 75441722.0, "reward": 0.3549107313156128, "reward_std": 0.23927921056747437, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "step": 697 }, { "clip_ratio/high_max": 0.0069348020588222425, "clip_ratio/high_mean": 0.001662389817283838, "clip_ratio/low_mean": 0.0008448962430520623, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025072860344153014, "epoch": 0.12119502110712, "grad_norm": 0.24053968489170074, "kl": 0.1544342041015625, "learning_rate": 1e-06, "loss": 0.0435, "step": 698 }, { "clip_ratio/high_max": 0.00805846405273769, "clip_ratio/high_mean": 0.0019417112180235563, "clip_ratio/low_mean": 0.0009182746616716031, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028599858796951594, "epoch": 0.12136865294251702, "grad_norm": 0.13164150714874268, "kl": 0.14337158203125, "learning_rate": 1e-06, "loss": 0.0433, "step": 699 }, { "clip_ratio/high_max": 0.00920650816624402, "clip_ratio/high_mean": 0.002141445848337753, "clip_ratio/low_mean": 0.0011945454652959597, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00333599129180584, "epoch": 0.12154228477791403, "grad_norm": 0.13187456130981445, "kl": 0.1407012939453125, "learning_rate": 1e-06, "loss": 0.043, "step": 700 }, { "clip_ratio/high_max": 0.010147226530534681, "clip_ratio/high_mean": 0.002419720156922267, "clip_ratio/low_mean": 0.0014563816994268564, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038761018822697224, "epoch": 0.12171591661331105, "grad_norm": 0.13244366645812988, "kl": 0.14111328125, "learning_rate": 1e-06, "loss": 0.0427, "step": 701 }, { "clip_ratio/high_max": 0.011356715876900125, "clip_ratio/high_mean": 0.0027609157018559927, "clip_ratio/low_mean": 0.001797576451281202, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004558492193609709, "epoch": 0.12188954844870807, "grad_norm": 0.7862778902053833, "kl": 0.138092041015625, "learning_rate": 1e-06, "loss": 0.0416, "step": 702 }, { "clip_ratio/high_max": 0.012416428973665461, "clip_ratio/high_mean": 0.002860062460968038, "clip_ratio/low_mean": 0.0023270483152373345, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005187110780752846, "epoch": 0.1220631802841051, "grad_norm": 1.0490303039550781, "kl": 0.321014404296875, "learning_rate": 1e-06, "loss": 0.0298, "step": 703 }, { "clip_ratio/high_max": 0.014272513180912938, "clip_ratio/high_mean": 0.0032632551474307547, "clip_ratio/low_mean": 0.002577783345259377, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005841038411745103, "epoch": 0.12223681211950212, "grad_norm": 0.09663736075162888, "kl": 0.155242919921875, "learning_rate": 1e-06, "loss": 0.0294, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2834821428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 1496.3929443359375, "completions/mean_terminated_length": 873.0217895507812, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.12241044395489913, "grad_norm": 0.1501658856868744, "kl": 0.141326904296875, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 76174530.0, "reward": 0.3593750298023224, "reward_std": 0.18577386438846588, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "step": 705 }, { "clip_ratio/high_max": 0.006962971070606727, "clip_ratio/high_mean": 0.0014166955866130593, "clip_ratio/low_mean": 0.0006573294685949804, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020740250797643967, "epoch": 0.12258407579029615, "grad_norm": 0.11895106732845306, "kl": 0.115570068359375, "learning_rate": 1e-06, "loss": 0.0185, "step": 706 }, { "clip_ratio/high_max": 0.008747992396820337, "clip_ratio/high_mean": 0.001838640342157305, "clip_ratio/low_mean": 0.0007558818385859922, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025945221832444076, "epoch": 0.12275770762569317, "grad_norm": 0.12957994639873505, "kl": 0.10756683349609375, "learning_rate": 1e-06, "loss": 0.0183, "step": 707 }, { "clip_ratio/high_max": 0.010042324058304075, "clip_ratio/high_mean": 0.002002126363095158, "clip_ratio/low_mean": 0.0010318484232811898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030339747727339272, "epoch": 0.1229313394610902, "grad_norm": 0.12528975307941437, "kl": 0.10546875, "learning_rate": 1e-06, "loss": 0.018, "step": 708 }, { "clip_ratio/high_max": 0.01178464868280571, "clip_ratio/high_mean": 0.0024170236119971378, "clip_ratio/low_mean": 0.0011753444166515692, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003592367905184801, "epoch": 0.12310497129648722, "grad_norm": 0.10864225029945374, "kl": 0.10509490966796875, "learning_rate": 1e-06, "loss": 0.0177, "step": 709 }, { "clip_ratio/high_max": 0.013163732503016945, "clip_ratio/high_mean": 0.00263474606708769, "clip_ratio/low_mean": 0.0014269993785092083, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004061745439685183, "epoch": 0.12327860313188423, "grad_norm": 0.09704262763261795, "kl": 0.110076904296875, "learning_rate": 1e-06, "loss": 0.0174, "step": 710 }, { "clip_ratio/high_max": 0.014027863755472936, "clip_ratio/high_mean": 0.002790560381981777, "clip_ratio/low_mean": 0.0018398401357444527, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00463040058457409, "epoch": 0.12345223496728125, "grad_norm": 0.08610663563013077, "kl": 0.1158905029296875, "learning_rate": 1e-06, "loss": 0.0171, "step": 711 }, { "clip_ratio/high_max": 0.016076147971034516, "clip_ratio/high_mean": 0.003199234279236407, "clip_ratio/low_mean": 0.0023950455733938725, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00559427989719552, "epoch": 0.12362586680267827, "grad_norm": 0.10939353704452515, "kl": 0.1253204345703125, "learning_rate": 1e-06, "loss": 0.0169, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2075892857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 1365.2098388671875, "completions/mean_terminated_length": 918.078857421875, "completions/min_length": 292.0, "completions/min_terminated_length": 292.0, "epoch": 0.1237994986380753, "grad_norm": 0.16894873976707458, "kl": 0.2320404052734375, "learning_rate": 1e-06, "loss": 0.0441, "num_tokens": 76842472.0, "reward": 0.4888392984867096, "reward_std": 0.23357054591178894, "rewards/accuracy_reward/mean": 0.4888392984867096, "rewards/accuracy_reward/std": 0.5004342198371887, "step": 713 }, { "clip_ratio/high_max": 0.004495662840781733, "clip_ratio/high_mean": 0.0011231962607780588, "clip_ratio/low_mean": 0.0008979003387139528, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020210965849400964, "epoch": 0.12397313047347232, "grad_norm": 0.11475851386785507, "kl": 0.1763458251953125, "learning_rate": 1e-06, "loss": 0.044, "step": 714 }, { "clip_ratio/high_max": 0.005228757043369114, "clip_ratio/high_mean": 0.0013599584781331941, "clip_ratio/low_mean": 0.0010763802486053464, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024363386764889583, "epoch": 0.12414676230886933, "grad_norm": 0.10420625656843185, "kl": 0.15771484375, "learning_rate": 1e-06, "loss": 0.0439, "step": 715 }, { "clip_ratio/high_max": 0.006901336924784118, "clip_ratio/high_mean": 0.0017729649298416916, "clip_ratio/low_mean": 0.0010361816907789034, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028091466110709007, "epoch": 0.12432039414426635, "grad_norm": 0.11434406042098999, "kl": 0.130584716796875, "learning_rate": 1e-06, "loss": 0.0436, "step": 716 }, { "clip_ratio/high_max": 0.007049434236250818, "clip_ratio/high_mean": 0.0018150281930502388, "clip_ratio/low_mean": 0.0013353989997995086, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003150427192395, "epoch": 0.12449402597966337, "grad_norm": 0.09684614837169647, "kl": 0.12969970703125, "learning_rate": 1e-06, "loss": 0.0434, "step": 717 }, { "clip_ratio/high_max": 0.007915874644822907, "clip_ratio/high_mean": 0.0019966914842370898, "clip_ratio/low_mean": 0.001576175103764399, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035728665361602907, "epoch": 0.1246676578150604, "grad_norm": 0.0860871747136116, "kl": 0.1297607421875, "learning_rate": 1e-06, "loss": 0.0432, "step": 718 }, { "clip_ratio/high_max": 0.009139117479207925, "clip_ratio/high_mean": 0.0023147284041442617, "clip_ratio/low_mean": 0.0019998133534500084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0043145417930645635, "epoch": 0.12484128965045742, "grad_norm": 0.07981272041797638, "kl": 0.13137054443359375, "learning_rate": 1e-06, "loss": 0.0429, "step": 719 }, { "clip_ratio/high_max": 0.010052810626802966, "clip_ratio/high_mean": 0.0025589274637241033, "clip_ratio/low_mean": 0.002318471337275696, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0048773987300592125, "epoch": 0.12501492148585444, "grad_norm": 0.08202718198299408, "kl": 0.1322479248046875, "learning_rate": 1e-06, "loss": 0.0427, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3482142857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3069.0, "completions/mean_length": 1742.774658203125, "completions/mean_terminated_length": 1032.640380859375, "completions/min_length": 331.0, "completions/min_terminated_length": 331.0, "epoch": 0.12518855332125145, "grad_norm": 95.47449493408203, "kl": 33.66581726074219, "learning_rate": 1e-06, "loss": 0.0702, "num_tokens": 77684323.0, "reward": 0.3236607313156128, "reward_std": 0.22334784269332886, "rewards/accuracy_reward/mean": 0.3236607015132904, "rewards/accuracy_reward/std": 0.46839529275894165, "step": 721 }, { "clip_ratio/high_max": 0.005491249099577544, "clip_ratio/high_mean": 0.0013233897007012274, "clip_ratio/low_mean": 0.0006321765040411265, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001955566222022753, "epoch": 0.12536218515664846, "grad_norm": 0.8378905057907104, "kl": 0.6124420166015625, "learning_rate": 1e-06, "loss": 0.0373, "step": 722 }, { "clip_ratio/high_max": 0.008675079436216038, "clip_ratio/high_mean": 0.002019861227381625, "clip_ratio/low_mean": 0.000645912222807965, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026657734506443376, "epoch": 0.1255358169920455, "grad_norm": 0.146189883351326, "kl": 0.2061920166015625, "learning_rate": 1e-06, "loss": 0.037, "step": 723 }, { "clip_ratio/high_max": 0.010253044041746762, "clip_ratio/high_mean": 0.002341299841646105, "clip_ratio/low_mean": 0.0008092760667750554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00315057593252277, "epoch": 0.1257094488274425, "grad_norm": 1.6865659952163696, "kl": 0.8098678588867188, "learning_rate": 1e-06, "loss": 0.0375, "step": 724 }, { "clip_ratio/high_max": 0.012498605843575206, "clip_ratio/high_mean": 0.002812539787555579, "clip_ratio/low_mean": 0.0010072250734083354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003819764890067745, "epoch": 0.12588308066283954, "grad_norm": 0.5360269546508789, "kl": 0.14156341552734375, "learning_rate": 1e-06, "loss": 0.037, "step": 725 }, { "clip_ratio/high_max": 0.012714248900010716, "clip_ratio/high_mean": 0.002829847102475469, "clip_ratio/low_mean": 0.0013582092124124756, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00418805626759422, "epoch": 0.12605671249823655, "grad_norm": 0.15322738885879517, "kl": 0.14122772216796875, "learning_rate": 1e-06, "loss": 0.0366, "step": 726 }, { "clip_ratio/high_max": 0.012865681965195108, "clip_ratio/high_mean": 0.002851028128134203, "clip_ratio/low_mean": 0.0015604834907207987, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004411511581565719, "epoch": 0.12623034433363356, "grad_norm": 0.15269990265369415, "kl": 0.17772674560546875, "learning_rate": 1e-06, "loss": 0.0365, "step": 727 }, { "clip_ratio/high_max": 0.01399625871272292, "clip_ratio/high_mean": 0.0031008590558485594, "clip_ratio/low_mean": 0.0017638269910094095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0048646860450389795, "epoch": 0.1264039761690306, "grad_norm": 0.18279065191745758, "kl": 0.2004852294921875, "learning_rate": 1e-06, "loss": 0.0362, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3504464285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 2926.0, "completions/mean_length": 1678.216552734375, "completions/mean_terminated_length": 926.2440185546875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.1265776080044276, "grad_norm": 0.1309364140033722, "kl": 0.12542724609375, "learning_rate": 1e-06, "loss": 0.0631, "num_tokens": 78505284.0, "reward": 0.4419642984867096, "reward_std": 0.26611489057540894, "rewards/accuracy_reward/mean": 0.4419642984867096, "rewards/accuracy_reward/std": 0.4971756041049957, "step": 729 }, { "clip_ratio/high_max": 0.005298067037074361, "clip_ratio/high_mean": 0.001267690719487291, "clip_ratio/low_mean": 0.0013250354234060069, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002592726184957428, "epoch": 0.12675123983982464, "grad_norm": 0.12342376261949539, "kl": 0.1439208984375, "learning_rate": 1e-06, "loss": 0.0632, "step": 730 }, { "clip_ratio/high_max": 0.005673517764080316, "clip_ratio/high_mean": 0.0013875372014808818, "clip_ratio/low_mean": 0.001801178571440687, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031887157620076323, "epoch": 0.12692487167522165, "grad_norm": 0.11891209334135056, "kl": 0.1468658447265625, "learning_rate": 1e-06, "loss": 0.0631, "step": 731 }, { "clip_ratio/high_max": 0.006186639420775464, "clip_ratio/high_mean": 0.001523271781479707, "clip_ratio/low_mean": 0.0023010059348962386, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038242777045525145, "epoch": 0.12709850351061866, "grad_norm": 0.13280156254768372, "kl": 0.150726318359375, "learning_rate": 1e-06, "loss": 0.0629, "step": 732 }, { "clip_ratio/high_max": 0.008348187446244992, "clip_ratio/high_mean": 0.002080268188365153, "clip_ratio/low_mean": 0.0021946583065073355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004274926450307248, "epoch": 0.1272721353460157, "grad_norm": 0.11344350129365921, "kl": 0.144500732421875, "learning_rate": 1e-06, "loss": 0.0625, "step": 733 }, { "clip_ratio/high_max": 0.009099416856770404, "clip_ratio/high_mean": 0.0022062304942664923, "clip_ratio/low_mean": 0.0025519328610243974, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004758163406222593, "epoch": 0.1274457671814127, "grad_norm": 0.10219965875148773, "kl": 0.1414794921875, "learning_rate": 1e-06, "loss": 0.0623, "step": 734 }, { "clip_ratio/high_max": 0.01169252977706492, "clip_ratio/high_mean": 0.002847526238838327, "clip_ratio/low_mean": 0.002433237647437636, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005280763893097173, "epoch": 0.12761939901680974, "grad_norm": 0.09208756685256958, "kl": 0.1287841796875, "learning_rate": 1e-06, "loss": 0.0619, "step": 735 }, { "clip_ratio/high_max": 0.013468556484440342, "clip_ratio/high_mean": 0.003164241941703949, "clip_ratio/low_mean": 0.002865147862394224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006029389725881629, "epoch": 0.12779303085220675, "grad_norm": 0.09670844674110413, "kl": 0.12689208984375, "learning_rate": 1e-06, "loss": 0.0616, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 3072.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 1298.5960693359375, "completions/mean_terminated_length": 889.3489379882812, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.12796666268760376, "grad_norm": 0.13283665478229523, "kl": 0.1629486083984375, "learning_rate": 1e-06, "loss": 0.0265, "num_tokens": 79150423.0, "reward": 0.4620535969734192, "reward_std": 0.23289792239665985, "rewards/accuracy_reward/mean": 0.4620535671710968, "rewards/accuracy_reward/std": 0.49911534786224365, "step": 737 }, { "clip_ratio/high_max": 0.006343984481645748, "clip_ratio/high_mean": 0.0018671928128242143, "clip_ratio/low_mean": 0.0007128210572773241, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002580013930128189, "epoch": 0.1281402945230008, "grad_norm": 0.1368260681629181, "kl": 0.14404296875, "learning_rate": 1e-06, "loss": 0.0267, "step": 738 }, { "clip_ratio/high_max": 0.0061233280939632095, "clip_ratio/high_mean": 0.0017131887098003062, "clip_ratio/low_mean": 0.0008509023436999996, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002564091028034454, "epoch": 0.1283139263583978, "grad_norm": 0.12014421075582504, "kl": 0.1569976806640625, "learning_rate": 1e-06, "loss": 0.0264, "step": 739 }, { "clip_ratio/high_max": 0.008260679718659958, "clip_ratio/high_mean": 0.0023371004399450612, "clip_ratio/low_mean": 0.0009612994253984652, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032983998862619046, "epoch": 0.12848755819379484, "grad_norm": 0.12066115438938141, "kl": 0.13677978515625, "learning_rate": 1e-06, "loss": 0.0262, "step": 740 }, { "clip_ratio/high_max": 0.009290305908507435, "clip_ratio/high_mean": 0.0024773400900812703, "clip_ratio/low_mean": 0.0013516980470740236, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038290381198748946, "epoch": 0.12866119002919185, "grad_norm": 0.10921292752027512, "kl": 0.1396331787109375, "learning_rate": 1e-06, "loss": 0.0258, "step": 741 }, { "clip_ratio/high_max": 0.009831442199356388, "clip_ratio/high_mean": 0.002581980677859974, "clip_ratio/low_mean": 0.001717062397801783, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004299043168430217, "epoch": 0.12883482186458886, "grad_norm": 0.10186201333999634, "kl": 0.14495849609375, "learning_rate": 1e-06, "loss": 0.0256, "step": 742 }, { "clip_ratio/high_max": 0.010555028595263138, "clip_ratio/high_mean": 0.0027536576108104782, "clip_ratio/low_mean": 0.0020543668542813975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004808024470548844, "epoch": 0.1290084536999859, "grad_norm": 0.1037551611661911, "kl": 0.1521759033203125, "learning_rate": 1e-06, "loss": 0.0253, "step": 743 }, { "clip_ratio/high_max": 0.01101310573722003, "clip_ratio/high_mean": 0.002885603840695694, "clip_ratio/low_mean": 0.002768404296148219, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005654008120473009, "epoch": 0.1291820855353829, "grad_norm": 0.09536939114332199, "kl": 0.1596221923828125, "learning_rate": 1e-06, "loss": 0.025, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2879464285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1642.5938720703125, "completions/mean_terminated_length": 1064.5579833984375, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.12935571737077994, "grad_norm": 0.45722898840904236, "kl": 0.4381866455078125, "learning_rate": 1e-06, "loss": 0.025, "num_tokens": 79955929.0, "reward": 0.3705357313156128, "reward_std": 0.2368021458387375, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "step": 745 }, { "clip_ratio/high_max": 0.004573592988890596, "clip_ratio/high_mean": 0.0011984563070654985, "clip_ratio/low_mean": 0.0009575960384609061, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002156052356440341, "epoch": 0.12952934920617695, "grad_norm": 0.15756018459796906, "kl": 0.1978607177734375, "learning_rate": 1e-06, "loss": 0.0248, "step": 746 }, { "clip_ratio/high_max": 0.005928724393015727, "clip_ratio/high_mean": 0.001501344657754089, "clip_ratio/low_mean": 0.0011127148372906959, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026140595218748786, "epoch": 0.12970298104157396, "grad_norm": 0.14603769779205322, "kl": 0.1498565673828125, "learning_rate": 1e-06, "loss": 0.0246, "step": 747 }, { "clip_ratio/high_max": 0.007099462349287933, "clip_ratio/high_mean": 0.001744119446584591, "clip_ratio/low_mean": 0.0012069526828781818, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029510721487895353, "epoch": 0.129876612876971, "grad_norm": 0.15315808355808258, "kl": 0.1196136474609375, "learning_rate": 1e-06, "loss": 0.0245, "step": 748 }, { "clip_ratio/high_max": 0.007422773487633094, "clip_ratio/high_mean": 0.0017583197682142782, "clip_ratio/low_mean": 0.0014622618664361653, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032205816205532756, "epoch": 0.130050244712368, "grad_norm": 0.11458150297403336, "kl": 0.1201171875, "learning_rate": 1e-06, "loss": 0.0242, "step": 749 }, { "clip_ratio/high_max": 0.007617628807565779, "clip_ratio/high_mean": 0.0017450814211770194, "clip_ratio/low_mean": 0.0017108333740907256, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034559148662083317, "epoch": 0.13022387654776504, "grad_norm": 0.08625411987304688, "kl": 0.123046875, "learning_rate": 1e-06, "loss": 0.0239, "step": 750 }, { "clip_ratio/high_max": 0.009523135937342886, "clip_ratio/high_mean": 0.0022513792941936117, "clip_ratio/low_mean": 0.0018647776241778047, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0041161569897667505, "epoch": 0.13039750838316205, "grad_norm": 0.0941966250538826, "kl": 0.1253204345703125, "learning_rate": 1e-06, "loss": 0.0237, "step": 751 }, { "clip_ratio/high_max": 0.011234143523324747, "clip_ratio/high_mean": 0.002570892012499826, "clip_ratio/low_mean": 0.0022826735066701076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004853565535086091, "epoch": 0.13057114021855906, "grad_norm": 0.11103711277246475, "kl": 0.124542236328125, "learning_rate": 1e-06, "loss": 0.0235, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1455.1473388671875, "completions/mean_terminated_length": 922.5934448242188, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.1307447720539561, "grad_norm": 0.11840981245040894, "kl": 0.1287994384765625, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 80666947.0, "reward": 0.4508928656578064, "reward_std": 0.1965171843767166, "rewards/accuracy_reward/mean": 0.4508928656578064, "rewards/accuracy_reward/std": 0.4981389045715332, "step": 753 }, { "clip_ratio/high_max": 0.004649448415875668, "clip_ratio/high_mean": 0.0010873288479160692, "clip_ratio/low_mean": 0.0008439560658644041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019312848926347215, "epoch": 0.1309184038893531, "grad_norm": 0.11041787266731262, "kl": 0.127288818359375, "learning_rate": 1e-06, "loss": 0.0103, "step": 754 }, { "clip_ratio/high_max": 0.00442522055527661, "clip_ratio/high_mean": 0.0010036486614808382, "clip_ratio/low_mean": 0.0011264078718795645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002130056542227976, "epoch": 0.13109203572475014, "grad_norm": 0.10280881822109222, "kl": 0.13446044921875, "learning_rate": 1e-06, "loss": 0.0101, "step": 755 }, { "clip_ratio/high_max": 0.0054096184685477056, "clip_ratio/high_mean": 0.0012299612162678386, "clip_ratio/low_mean": 0.0013746469862780941, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026046081766253337, "epoch": 0.13126566756014715, "grad_norm": 0.10283561050891876, "kl": 0.1260528564453125, "learning_rate": 1e-06, "loss": 0.0098, "step": 756 }, { "clip_ratio/high_max": 0.006690664267807733, "clip_ratio/high_mean": 0.0014905456546330242, "clip_ratio/low_mean": 0.0016840761686580663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003174621780999587, "epoch": 0.13143929939554416, "grad_norm": 0.09554052352905273, "kl": 0.129669189453125, "learning_rate": 1e-06, "loss": 0.0096, "step": 757 }, { "clip_ratio/high_max": 0.00774639211158501, "clip_ratio/high_mean": 0.0016978389548967243, "clip_ratio/low_mean": 0.0019049457655455626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003602784758186317, "epoch": 0.1316129312309412, "grad_norm": 0.08958891034126282, "kl": 0.1281585693359375, "learning_rate": 1e-06, "loss": 0.0094, "step": 758 }, { "clip_ratio/high_max": 0.009928208790370263, "clip_ratio/high_mean": 0.0022186937558217323, "clip_ratio/low_mean": 0.0019335695899371785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004152263350988505, "epoch": 0.1317865630663382, "grad_norm": 0.08464101701974869, "kl": 0.121185302734375, "learning_rate": 1e-06, "loss": 0.0091, "step": 759 }, { "clip_ratio/high_max": 0.011175720414030366, "clip_ratio/high_mean": 0.0024917394257499836, "clip_ratio/low_mean": 0.0023706597944510577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004862399227931746, "epoch": 0.13196019490173524, "grad_norm": 0.07568498700857162, "kl": 0.1269378662109375, "learning_rate": 1e-06, "loss": 0.0089, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3147321428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 1671.075927734375, "completions/mean_terminated_length": 1027.6546630859375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.13213382673713225, "grad_norm": 0.14492802321910858, "kl": 0.1734619140625, "learning_rate": 1e-06, "loss": 0.0387, "num_tokens": 81487493.0, "reward": 0.392857164144516, "reward_std": 0.24333171546459198, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "step": 761 }, { "clip_ratio/high_max": 0.005856885753019014, "clip_ratio/high_mean": 0.0015062489092088072, "clip_ratio/low_mean": 0.0007270924114664012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002233341354440199, "epoch": 0.13230745857252926, "grad_norm": 0.11569356173276901, "kl": 0.15943145751953125, "learning_rate": 1e-06, "loss": 0.0388, "step": 762 }, { "clip_ratio/high_max": 0.00812840483013133, "clip_ratio/high_mean": 0.002007959056754771, "clip_ratio/low_mean": 0.0007324493919895758, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027404083957662806, "epoch": 0.1324810904079263, "grad_norm": 0.11656951159238815, "kl": 0.14580535888671875, "learning_rate": 1e-06, "loss": 0.0386, "step": 763 }, { "clip_ratio/high_max": 0.008859012757966411, "clip_ratio/high_mean": 0.0022125094874354545, "clip_ratio/low_mean": 0.0009666368132457137, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031791462424735073, "epoch": 0.1326547222433233, "grad_norm": 0.10464467108249664, "kl": 0.14444732666015625, "learning_rate": 1e-06, "loss": 0.0383, "step": 764 }, { "clip_ratio/high_max": 0.011797994600783568, "clip_ratio/high_mean": 0.0029322627324290806, "clip_ratio/low_mean": 0.0010518755659632006, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003984138231317047, "epoch": 0.13282835407872035, "grad_norm": 0.11386299133300781, "kl": 0.137542724609375, "learning_rate": 1e-06, "loss": 0.0381, "step": 765 }, { "clip_ratio/high_max": 0.01271689905115636, "clip_ratio/high_mean": 0.003153451087200665, "clip_ratio/low_mean": 0.0014146028527193266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004568053878756473, "epoch": 0.13300198591411735, "grad_norm": 0.1135941743850708, "kl": 0.13573455810546875, "learning_rate": 1e-06, "loss": 0.0378, "step": 766 }, { "clip_ratio/high_max": 0.013573938733316027, "clip_ratio/high_mean": 0.0033609861120567075, "clip_ratio/low_mean": 0.0018509422361603356, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005211928382777842, "epoch": 0.13317561774951436, "grad_norm": 0.1091289296746254, "kl": 0.15374755859375, "learning_rate": 1e-06, "loss": 0.0374, "step": 767 }, { "clip_ratio/high_max": 0.015075512972543947, "clip_ratio/high_mean": 0.003729324267624179, "clip_ratio/low_mean": 0.002286776759319764, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006016101029672427, "epoch": 0.1333492495849114, "grad_norm": 0.13516508042812347, "kl": 0.162933349609375, "learning_rate": 1e-06, "loss": 0.0372, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 1550.4285888671875, "completions/mean_terminated_length": 1031.08984375, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "epoch": 0.1335228814203084, "grad_norm": 0.13671624660491943, "kl": 0.1623687744140625, "learning_rate": 1e-06, "loss": 0.0402, "num_tokens": 82241717.0, "reward": 0.4441964626312256, "reward_std": 0.24092385172843933, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "step": 769 }, { "clip_ratio/high_max": 0.008187312174413819, "clip_ratio/high_mean": 0.0020215118438500213, "clip_ratio/low_mean": 0.0006114596976658504, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026329715255997144, "epoch": 0.13369651325570545, "grad_norm": 0.13070477545261383, "kl": 0.1295318603515625, "learning_rate": 1e-06, "loss": 0.0402, "step": 770 }, { "clip_ratio/high_max": 0.0075513925912673585, "clip_ratio/high_mean": 0.0019373290433577495, "clip_ratio/low_mean": 0.0007601992490435805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026975282817147672, "epoch": 0.13387014509110245, "grad_norm": 0.11209924519062042, "kl": 0.1289825439453125, "learning_rate": 1e-06, "loss": 0.04, "step": 771 }, { "clip_ratio/high_max": 0.008261256603873335, "clip_ratio/high_mean": 0.002132965746568516, "clip_ratio/low_mean": 0.0009522805166852777, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003085246262344299, "epoch": 0.13404377692649946, "grad_norm": 0.09769391268491745, "kl": 0.1280975341796875, "learning_rate": 1e-06, "loss": 0.0398, "step": 772 }, { "clip_ratio/high_max": 0.00814004960557213, "clip_ratio/high_mean": 0.0020091483247597353, "clip_ratio/low_mean": 0.001406879372552794, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003416027651837794, "epoch": 0.1342174087618965, "grad_norm": 0.09594806283712387, "kl": 0.1412811279296875, "learning_rate": 1e-06, "loss": 0.0395, "step": 773 }, { "clip_ratio/high_max": 0.010047061572549865, "clip_ratio/high_mean": 0.0024380624545301544, "clip_ratio/low_mean": 0.0015388888659799704, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003976951265940443, "epoch": 0.1343910405972935, "grad_norm": 0.09834489971399307, "kl": 0.1396484375, "learning_rate": 1e-06, "loss": 0.0393, "step": 774 }, { "clip_ratio/high_max": 0.011893805713043548, "clip_ratio/high_mean": 0.002864668520487612, "clip_ratio/low_mean": 0.0017288978488068096, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004593566347466549, "epoch": 0.13456467243269055, "grad_norm": 0.08642339706420898, "kl": 0.1357574462890625, "learning_rate": 1e-06, "loss": 0.0391, "step": 775 }, { "clip_ratio/high_max": 0.012467531531001441, "clip_ratio/high_mean": 0.0029890138594055315, "clip_ratio/low_mean": 0.002180741936172126, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005169755844690371, "epoch": 0.13473830426808756, "grad_norm": 0.0845668837428093, "kl": 0.1384429931640625, "learning_rate": 1e-06, "loss": 0.0389, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2276785714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 1375.5960693359375, "completions/mean_terminated_length": 875.5, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "epoch": 0.13491193610348456, "grad_norm": 0.13680359721183777, "kl": 0.129974365234375, "learning_rate": 1e-06, "loss": 0.0441, "num_tokens": 82915056.0, "reward": 0.4441964626312256, "reward_std": 0.2777610719203949, "rewards/accuracy_reward/mean": 0.4441964328289032, "rewards/accuracy_reward/std": 0.4974316358566284, "step": 777 }, { "clip_ratio/high_max": 0.007251445444126148, "clip_ratio/high_mean": 0.0019971221067862643, "clip_ratio/low_mean": 0.0008049924226725125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002802114588121185, "epoch": 0.1350855679388816, "grad_norm": 0.1495600789785385, "kl": 0.1193084716796875, "learning_rate": 1e-06, "loss": 0.0441, "step": 778 }, { "clip_ratio/high_max": 0.00836967611030559, "clip_ratio/high_mean": 0.0021343678126868326, "clip_ratio/low_mean": 0.0010306947160643176, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031650625314796343, "epoch": 0.1352591997742786, "grad_norm": 0.13852651417255402, "kl": 0.1223297119140625, "learning_rate": 1e-06, "loss": 0.0438, "step": 779 }, { "clip_ratio/high_max": 0.00992202052657376, "clip_ratio/high_mean": 0.0026549288550086203, "clip_ratio/low_mean": 0.0010335633128306654, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036884922337776516, "epoch": 0.13543283160967565, "grad_norm": 0.13246159255504608, "kl": 0.1200408935546875, "learning_rate": 1e-06, "loss": 0.0435, "step": 780 }, { "clip_ratio/high_max": 0.009930613174219616, "clip_ratio/high_mean": 0.0027537203750398476, "clip_ratio/low_mean": 0.0014182142822392052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004171934680925915, "epoch": 0.13560646344507266, "grad_norm": 0.11637328565120697, "kl": 0.1275177001953125, "learning_rate": 1e-06, "loss": 0.0432, "step": 781 }, { "clip_ratio/high_max": 0.010311669950169744, "clip_ratio/high_mean": 0.0027619637176030665, "clip_ratio/low_mean": 0.002116714476414927, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004878678228124045, "epoch": 0.13578009528046966, "grad_norm": 0.14905379712581635, "kl": 0.150390625, "learning_rate": 1e-06, "loss": 0.0429, "step": 782 }, { "clip_ratio/high_max": 0.011935563452425413, "clip_ratio/high_mean": 0.003173705659719417, "clip_ratio/low_mean": 0.0026466057261131937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0058203113876516, "epoch": 0.1359537271158667, "grad_norm": 0.11793313920497894, "kl": 0.138641357421875, "learning_rate": 1e-06, "loss": 0.0425, "step": 783 }, { "clip_ratio/high_max": 0.012871405306214001, "clip_ratio/high_mean": 0.0033223442269445513, "clip_ratio/low_mean": 0.0032433177398161206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0065656619117362425, "epoch": 0.1361273589512637, "grad_norm": 0.10625766217708588, "kl": 0.1405487060546875, "learning_rate": 1e-06, "loss": 0.0422, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2633928571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 2953.0, "completions/mean_length": 1519.5001220703125, "completions/mean_terminated_length": 964.3635864257812, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.13630099078666075, "grad_norm": 0.12400048226118088, "kl": 0.172882080078125, "learning_rate": 1e-06, "loss": 0.0261, "num_tokens": 83657536.0, "reward": 0.3593750298023224, "reward_std": 0.20410941541194916, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.4803536534309387, "step": 785 }, { "clip_ratio/high_max": 0.0053091306654096115, "clip_ratio/high_mean": 0.0012938557688357832, "clip_ratio/low_mean": 0.0007621444931373844, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020560002740239725, "epoch": 0.13647462262205776, "grad_norm": 0.12006939947605133, "kl": 0.17205810546875, "learning_rate": 1e-06, "loss": 0.0261, "step": 786 }, { "clip_ratio/high_max": 0.006039867748768302, "clip_ratio/high_mean": 0.0014505109493256896, "clip_ratio/low_mean": 0.0008771552240887104, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023276661995623726, "epoch": 0.13664825445745477, "grad_norm": 0.1165611743927002, "kl": 0.168426513671875, "learning_rate": 1e-06, "loss": 0.0259, "step": 787 }, { "clip_ratio/high_max": 0.0078955468343338, "clip_ratio/high_mean": 0.0018463766509739798, "clip_ratio/low_mean": 0.0009737514110383927, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028201280656503513, "epoch": 0.1368218862928518, "grad_norm": 0.1071123406291008, "kl": 0.1580047607421875, "learning_rate": 1e-06, "loss": 0.0256, "step": 788 }, { "clip_ratio/high_max": 0.009773234807653353, "clip_ratio/high_mean": 0.0022602539393119514, "clip_ratio/low_mean": 0.001093944739750441, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003354198681336129, "epoch": 0.1369955181282488, "grad_norm": 0.1106075569987297, "kl": 0.14752197265625, "learning_rate": 1e-06, "loss": 0.0253, "step": 789 }, { "clip_ratio/high_max": 0.010188379739702214, "clip_ratio/high_mean": 0.0022605071444559144, "clip_ratio/low_mean": 0.0017392592808391782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003999766373453895, "epoch": 0.13716914996364585, "grad_norm": 0.08936687558889389, "kl": 0.1597442626953125, "learning_rate": 1e-06, "loss": 0.0251, "step": 790 }, { "clip_ratio/high_max": 0.0128181981126545, "clip_ratio/high_mean": 0.0028415783472155454, "clip_ratio/low_mean": 0.001989368375234335, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00483094666924444, "epoch": 0.13734278179904286, "grad_norm": 0.08778053522109985, "kl": 0.15283203125, "learning_rate": 1e-06, "loss": 0.0248, "step": 791 }, { "clip_ratio/high_max": 0.014708967268234119, "clip_ratio/high_mean": 0.0032698397371859755, "clip_ratio/low_mean": 0.0025156756341857545, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005785515415482223, "epoch": 0.13751641363443987, "grad_norm": 0.09349148720502853, "kl": 0.152801513671875, "learning_rate": 1e-06, "loss": 0.0246, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2790178571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 3064.0, "completions/mean_length": 1567.4888916015625, "completions/mean_terminated_length": 985.2476806640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.1376900454698369, "grad_norm": 0.1312902420759201, "kl": 0.162078857421875, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 84422867.0, "reward": 0.3058035969734192, "reward_std": 0.18885572254657745, "rewards/accuracy_reward/mean": 0.3058035671710968, "rewards/accuracy_reward/std": 0.4612620174884796, "step": 793 }, { "clip_ratio/high_max": 0.0057858891268551815, "clip_ratio/high_mean": 0.0013329232160685933, "clip_ratio/low_mean": 0.0006566267627476918, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001989549997233553, "epoch": 0.1378636773052339, "grad_norm": 0.1143469512462616, "kl": 0.1548919677734375, "learning_rate": 1e-06, "loss": 0.0153, "step": 794 }, { "clip_ratio/high_max": 0.007636122008989332, "clip_ratio/high_mean": 0.0016501042855452397, "clip_ratio/low_mean": 0.0008192631332804012, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002469367425874225, "epoch": 0.13803730914063095, "grad_norm": 0.1056492030620575, "kl": 0.1630401611328125, "learning_rate": 1e-06, "loss": 0.0151, "step": 795 }, { "clip_ratio/high_max": 0.009374222921906039, "clip_ratio/high_mean": 0.001954168689735525, "clip_ratio/low_mean": 0.0009209676452428539, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028751363315677736, "epoch": 0.13821094097602796, "grad_norm": 0.10034377127885818, "kl": 0.1496124267578125, "learning_rate": 1e-06, "loss": 0.0149, "step": 796 }, { "clip_ratio/high_max": 0.010722410570451757, "clip_ratio/high_mean": 0.002228019099220546, "clip_ratio/low_mean": 0.0011738782482098031, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034018973565252963, "epoch": 0.13838457281142497, "grad_norm": 0.09401454776525497, "kl": 0.1467132568359375, "learning_rate": 1e-06, "loss": 0.0147, "step": 797 }, { "clip_ratio/high_max": 0.01174796722989413, "clip_ratio/high_mean": 0.0023529294599029527, "clip_ratio/low_mean": 0.0015248459594658925, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003877775394357741, "epoch": 0.138558204646822, "grad_norm": 0.08500724285840988, "kl": 0.1467132568359375, "learning_rate": 1e-06, "loss": 0.0144, "step": 798 }, { "clip_ratio/high_max": 0.014157428857288323, "clip_ratio/high_mean": 0.002994827446855197, "clip_ratio/low_mean": 0.001749887520077209, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004744714929984184, "epoch": 0.138731836482219, "grad_norm": 0.08257627487182617, "kl": 0.1411285400390625, "learning_rate": 1e-06, "loss": 0.0142, "step": 799 }, { "clip_ratio/high_max": 0.014512304231175222, "clip_ratio/high_mean": 0.0030440513301073224, "clip_ratio/low_mean": 0.0023146970293055347, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005358748461731011, "epoch": 0.13890546831761605, "grad_norm": 0.07804179191589355, "kl": 0.154937744140625, "learning_rate": 1e-06, "loss": 0.014, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2991071428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 1589.2969970703125, "completions/mean_terminated_length": 956.5509643554688, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.13907910015301306, "grad_norm": 0.3039197623729706, "kl": 0.208770751953125, "learning_rate": 1e-06, "loss": 0.0535, "num_tokens": 85199808.0, "reward": 0.3191964328289032, "reward_std": 0.22289502620697021, "rewards/accuracy_reward/mean": 0.3191964328289032, "rewards/accuracy_reward/std": 0.4666863977909088, "step": 801 }, { "clip_ratio/high_max": 0.004623879438440781, "clip_ratio/high_mean": 0.001073978812655696, "clip_ratio/low_mean": 0.0008573242328111519, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0019313030295506906, "epoch": 0.13925273198841007, "grad_norm": 15.179034233093262, "kl": 0.1461181640625, "learning_rate": 1e-06, "loss": 0.0913, "step": 802 }, { "clip_ratio/high_max": 0.003996709720013314, "clip_ratio/high_mean": 0.001026532027481153, "clip_ratio/low_mean": 0.0012152552390034543, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002241787284219754, "epoch": 0.1394263638238071, "grad_norm": 1.2741217613220215, "kl": 2.100128173828125, "learning_rate": 1e-06, "loss": 0.0553, "step": 803 }, { "clip_ratio/high_max": 0.006071298303140793, "clip_ratio/high_mean": 0.0014361167109200323, "clip_ratio/low_mean": 0.001056462969245331, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002492579656518501, "epoch": 0.1395999956592041, "grad_norm": 0.13043788075447083, "kl": 0.2698211669921875, "learning_rate": 1e-06, "loss": 0.0534, "step": 804 }, { "clip_ratio/high_max": 0.007026201317785308, "clip_ratio/high_mean": 0.0017494114972578245, "clip_ratio/low_mean": 0.0015151436150517839, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032645550554661895, "epoch": 0.13977362749460112, "grad_norm": 0.10985497385263443, "kl": 0.15399169921875, "learning_rate": 1e-06, "loss": 0.0531, "step": 805 }, { "clip_ratio/high_max": 0.008192366811272223, "clip_ratio/high_mean": 0.002053507583696046, "clip_ratio/low_mean": 0.0016688928867552022, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037224004672680167, "epoch": 0.13994725932999816, "grad_norm": 0.1227654218673706, "kl": 0.1330108642578125, "learning_rate": 1e-06, "loss": 0.053, "step": 806 }, { "clip_ratio/high_max": 0.00903757687046891, "clip_ratio/high_mean": 0.0021788618050777586, "clip_ratio/low_mean": 0.0021292396327226015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00430810146190197, "epoch": 0.14012089116539517, "grad_norm": 0.12357151508331299, "kl": 0.1407012939453125, "learning_rate": 1e-06, "loss": 0.0528, "step": 807 }, { "clip_ratio/high_max": 0.010188058033236302, "clip_ratio/high_mean": 0.0024793773945930297, "clip_ratio/low_mean": 0.0020525205136436853, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004531897899141768, "epoch": 0.1402945230007922, "grad_norm": 0.12397874891757965, "kl": 0.1286468505859375, "learning_rate": 1e-06, "loss": 0.0526, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2901785714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 3072.0, "completions/mean_length": 1612.1785888671875, "completions/mean_terminated_length": 1015.3961791992188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.1404681548361892, "grad_norm": 1.7714616060256958, "kl": 0.616455078125, "learning_rate": 1e-06, "loss": 0.0288, "num_tokens": 85985904.0, "reward": 0.3727678656578064, "reward_std": 0.24469605088233948, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "step": 809 }, { "clip_ratio/high_max": 0.006096282349972171, "clip_ratio/high_mean": 0.0015314717542196377, "clip_ratio/low_mean": 0.0006793605675738945, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002210832306445809, "epoch": 0.14064178667158622, "grad_norm": 0.12015894055366516, "kl": 0.1519622802734375, "learning_rate": 1e-06, "loss": 0.0284, "step": 810 }, { "clip_ratio/high_max": 0.0059275729981891345, "clip_ratio/high_mean": 0.0014763959159154183, "clip_ratio/low_mean": 0.000775804072191022, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002252200016300776, "epoch": 0.14081541850698326, "grad_norm": 0.1202663779258728, "kl": 0.146575927734375, "learning_rate": 1e-06, "loss": 0.0284, "step": 811 }, { "clip_ratio/high_max": 0.007012452346316422, "clip_ratio/high_mean": 0.0017443641648924313, "clip_ratio/low_mean": 0.0009279589876314276, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002672323164915724, "epoch": 0.14098905034238027, "grad_norm": 0.1141498014330864, "kl": 0.13592529296875, "learning_rate": 1e-06, "loss": 0.0281, "step": 812 }, { "clip_ratio/high_max": 0.007774580542900367, "clip_ratio/high_mean": 0.00189676758668611, "clip_ratio/low_mean": 0.0011284318798061577, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030251994321588427, "epoch": 0.1411626821777773, "grad_norm": 0.09997332841157913, "kl": 0.1361846923828125, "learning_rate": 1e-06, "loss": 0.0278, "step": 813 }, { "clip_ratio/high_max": 0.008496897376971901, "clip_ratio/high_mean": 0.002038200623701414, "clip_ratio/low_mean": 0.0013623881236526358, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034005887755483855, "epoch": 0.1413363140131743, "grad_norm": 0.08463917672634125, "kl": 0.1387786865234375, "learning_rate": 1e-06, "loss": 0.0276, "step": 814 }, { "clip_ratio/high_max": 0.008846052178341779, "clip_ratio/high_mean": 0.002116432043749228, "clip_ratio/low_mean": 0.0018328963092244521, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003949328371163574, "epoch": 0.14150994584857132, "grad_norm": 0.08638922870159149, "kl": 0.1407623291015625, "learning_rate": 1e-06, "loss": 0.0274, "step": 815 }, { "clip_ratio/high_max": 0.010045790553704137, "clip_ratio/high_mean": 0.0023482408248582942, "clip_ratio/low_mean": 0.002172278502712288, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004520519301877357, "epoch": 0.14168357768396836, "grad_norm": 0.09199316054582596, "kl": 0.14276123046875, "learning_rate": 1e-06, "loss": 0.0272, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3191964285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 1667.0179443359375, "completions/mean_terminated_length": 1008.2885131835938, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "epoch": 0.14185720951936537, "grad_norm": 0.13448764383792877, "kl": 0.1619720458984375, "learning_rate": 1e-06, "loss": 0.0383, "num_tokens": 86795744.0, "reward": 0.328125, "reward_std": 0.2585868537425995, "rewards/accuracy_reward/mean": 0.328125, "rewards/accuracy_reward/std": 0.470055490732193, "step": 817 }, { "clip_ratio/high_max": 0.0056702462843531976, "clip_ratio/high_mean": 0.0015266448299371405, "clip_ratio/low_mean": 0.0010415639690108947, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025682087671157205, "epoch": 0.1420308413547624, "grad_norm": 0.12268481403589249, "kl": 0.1597137451171875, "learning_rate": 1e-06, "loss": 0.0383, "step": 818 }, { "clip_ratio/high_max": 0.005838480134116253, "clip_ratio/high_mean": 0.0015642015359844663, "clip_ratio/low_mean": 0.0010669717262317135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026311733081456623, "epoch": 0.14220447319015941, "grad_norm": 0.12328120321035385, "kl": 0.14886474609375, "learning_rate": 1e-06, "loss": 0.0382, "step": 819 }, { "clip_ratio/high_max": 0.0061204394205560675, "clip_ratio/high_mean": 0.0017141145826826687, "clip_ratio/low_mean": 0.0011703384575412201, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002884453079332161, "epoch": 0.14237810502555642, "grad_norm": 0.11299321800470352, "kl": 0.14984130859375, "learning_rate": 1e-06, "loss": 0.0379, "step": 820 }, { "clip_ratio/high_max": 0.007737678020930616, "clip_ratio/high_mean": 0.00197274713036677, "clip_ratio/low_mean": 0.001737692863571283, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003710439952556044, "epoch": 0.14255173686095346, "grad_norm": 0.10032941401004791, "kl": 0.1511993408203125, "learning_rate": 1e-06, "loss": 0.0376, "step": 821 }, { "clip_ratio/high_max": 0.008177364357834449, "clip_ratio/high_mean": 0.0021687004489194806, "clip_ratio/low_mean": 0.002244828483526362, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004413529022713192, "epoch": 0.14272536869635047, "grad_norm": 0.09955345094203949, "kl": 0.1570281982421875, "learning_rate": 1e-06, "loss": 0.0373, "step": 822 }, { "clip_ratio/high_max": 0.01005861215708137, "clip_ratio/high_mean": 0.0025782986367630656, "clip_ratio/low_mean": 0.002768194348391262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005346493049728451, "epoch": 0.1428990005317475, "grad_norm": 0.10311928391456604, "kl": 0.1602630615234375, "learning_rate": 1e-06, "loss": 0.037, "step": 823 }, { "clip_ratio/high_max": 0.011701907526003197, "clip_ratio/high_mean": 0.0030203135529518477, "clip_ratio/low_mean": 0.003291479825747956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006311793485110684, "epoch": 0.14307263236714451, "grad_norm": 0.09523601084947586, "kl": 0.1621856689453125, "learning_rate": 1e-06, "loss": 0.0367, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3147321428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3060.0, "completions/mean_length": 1672.6785888671875, "completions/mean_terminated_length": 1029.993408203125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.14324626420254152, "grad_norm": 0.13028088212013245, "kl": 0.19745635986328125, "learning_rate": 1e-06, "loss": 0.0391, "num_tokens": 87610240.0, "reward": 0.3705357313156128, "reward_std": 0.2500174939632416, "rewards/accuracy_reward/mean": 0.3705357015132904, "rewards/accuracy_reward/std": 0.48348814249038696, "step": 825 }, { "clip_ratio/high_max": 0.004500452501815744, "clip_ratio/high_mean": 0.0010394788391749898, "clip_ratio/low_mean": 0.000987715425026181, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020271942448744085, "epoch": 0.14341989603793856, "grad_norm": 0.11101224273443222, "kl": 0.183441162109375, "learning_rate": 1e-06, "loss": 0.0391, "step": 826 }, { "clip_ratio/high_max": 0.005234896238107467, "clip_ratio/high_mean": 0.001247797907126369, "clip_ratio/low_mean": 0.0009559605259710224, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022037584267309285, "epoch": 0.14359352787333557, "grad_norm": 0.11052276194095612, "kl": 0.149566650390625, "learning_rate": 1e-06, "loss": 0.039, "step": 827 }, { "clip_ratio/high_max": 0.005965044289041543, "clip_ratio/high_mean": 0.001414615892826987, "clip_ratio/low_mean": 0.0011319167160763755, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002546532658925571, "epoch": 0.1437671597087326, "grad_norm": 0.10994476079940796, "kl": 0.14890289306640625, "learning_rate": 1e-06, "loss": 0.0387, "step": 828 }, { "clip_ratio/high_max": 0.006221088025995414, "clip_ratio/high_mean": 0.001469801466100762, "clip_ratio/low_mean": 0.0013713990717860725, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028412005085556302, "epoch": 0.14394079154412961, "grad_norm": 0.09687504917383194, "kl": 0.14400482177734375, "learning_rate": 1e-06, "loss": 0.0385, "step": 829 }, { "clip_ratio/high_max": 0.007762572797219036, "clip_ratio/high_mean": 0.0018034211711892567, "clip_ratio/low_mean": 0.0016928216441556287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003496242755318235, "epoch": 0.14411442337952662, "grad_norm": 0.08925654739141464, "kl": 0.1407012939453125, "learning_rate": 1e-06, "loss": 0.0383, "step": 830 }, { "clip_ratio/high_max": 0.007788682354657794, "clip_ratio/high_mean": 0.0018503316682654258, "clip_ratio/low_mean": 0.0021342637555790134, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003984595454312512, "epoch": 0.14428805521492366, "grad_norm": 0.09217309206724167, "kl": 0.14194488525390625, "learning_rate": 1e-06, "loss": 0.0381, "step": 831 }, { "clip_ratio/high_max": 0.009372119329782436, "clip_ratio/high_mean": 0.002132900726337539, "clip_ratio/low_mean": 0.0026238514140004554, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004756752192406566, "epoch": 0.14446168705032067, "grad_norm": 0.09787027537822723, "kl": 0.14357757568359375, "learning_rate": 1e-06, "loss": 0.0379, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2165178571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 2812.0, "completions/mean_length": 1513.602783203125, "completions/mean_terminated_length": 1082.9344482421875, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.1446353188857177, "grad_norm": 0.15892496705055237, "kl": 0.2413177490234375, "learning_rate": 1e-06, "loss": 0.0319, "num_tokens": 88352334.0, "reward": 0.3839285969734192, "reward_std": 0.27249112725257874, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688456416130066, "step": 833 }, { "clip_ratio/high_max": 0.0053078828350408, "clip_ratio/high_mean": 0.0015349199920819956, "clip_ratio/low_mean": 0.0007772596854920266, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002312179696673411, "epoch": 0.14480895072111472, "grad_norm": 0.12118658423423767, "kl": 0.17218017578125, "learning_rate": 1e-06, "loss": 0.0319, "step": 834 }, { "clip_ratio/high_max": 0.006532325505759218, "clip_ratio/high_mean": 0.001831927120747423, "clip_ratio/low_mean": 0.0009609118724256405, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002792838959067012, "epoch": 0.14498258255651172, "grad_norm": 0.1274677962064743, "kl": 0.1710205078125, "learning_rate": 1e-06, "loss": 0.0317, "step": 835 }, { "clip_ratio/high_max": 0.006988576908042887, "clip_ratio/high_mean": 0.0019819164472210105, "clip_ratio/low_mean": 0.001055178473052365, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030370949370990274, "epoch": 0.14515621439190876, "grad_norm": 0.12129134684801102, "kl": 0.1470947265625, "learning_rate": 1e-06, "loss": 0.0314, "step": 836 }, { "clip_ratio/high_max": 0.00731004268527613, "clip_ratio/high_mean": 0.0021044423401690437, "clip_ratio/low_mean": 0.0013317167481545766, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034361590933258412, "epoch": 0.14532984622730577, "grad_norm": 0.10082930326461792, "kl": 0.149200439453125, "learning_rate": 1e-06, "loss": 0.0311, "step": 837 }, { "clip_ratio/high_max": 0.00760839232680155, "clip_ratio/high_mean": 0.0021708261583626154, "clip_ratio/low_mean": 0.0016991147576845833, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038699409160471987, "epoch": 0.1455034780627028, "grad_norm": 0.09723855555057526, "kl": 0.1516571044921875, "learning_rate": 1e-06, "loss": 0.0308, "step": 838 }, { "clip_ratio/high_max": 0.009051359877048526, "clip_ratio/high_mean": 0.0026643742439773632, "clip_ratio/low_mean": 0.002212669179243676, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004877043342276011, "epoch": 0.14567710989809982, "grad_norm": 0.11088015884160995, "kl": 0.1569061279296875, "learning_rate": 1e-06, "loss": 0.0306, "step": 839 }, { "clip_ratio/high_max": 0.011014060546585824, "clip_ratio/high_mean": 0.0031917121577862417, "clip_ratio/low_mean": 0.0025846572234513587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0057763695367611945, "epoch": 0.14585074173349682, "grad_norm": 0.10320910066366196, "kl": 0.155426025390625, "learning_rate": 1e-06, "loss": 0.0303, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3048.0, "completions/mean_length": 1559.6629638671875, "completions/mean_terminated_length": 967.87890625, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "epoch": 0.14602437356889386, "grad_norm": 0.1045524850487709, "kl": 0.168609619140625, "learning_rate": 1e-06, "loss": 0.0277, "num_tokens": 89114359.0, "reward": 0.3258928656578064, "reward_std": 0.19215652346611023, "rewards/accuracy_reward/mean": 0.3258928656578064, "rewards/accuracy_reward/std": 0.4692314565181732, "step": 841 }, { "clip_ratio/high_max": 0.003974321161877015, "clip_ratio/high_mean": 0.0008860767438818584, "clip_ratio/low_mean": 0.0007417930927431371, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016278698221867671, "epoch": 0.14619800540429087, "grad_norm": 0.09701497107744217, "kl": 0.1612396240234375, "learning_rate": 1e-06, "loss": 0.0278, "step": 842 }, { "clip_ratio/high_max": 0.00462410819818615, "clip_ratio/high_mean": 0.0010549737803557946, "clip_ratio/low_mean": 0.0008310013445225195, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018859751094169042, "epoch": 0.1463716372396879, "grad_norm": 0.09846173226833344, "kl": 0.14605712890625, "learning_rate": 1e-06, "loss": 0.0276, "step": 843 }, { "clip_ratio/high_max": 0.005125424489961006, "clip_ratio/high_mean": 0.0011085587289016985, "clip_ratio/low_mean": 0.00111258455638108, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022211432865333336, "epoch": 0.14654526907508492, "grad_norm": 0.09294673800468445, "kl": 0.140777587890625, "learning_rate": 1e-06, "loss": 0.0274, "step": 844 }, { "clip_ratio/high_max": 0.0056071074159262935, "clip_ratio/high_mean": 0.0012237981841280998, "clip_ratio/low_mean": 0.0012883734093520616, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002512171630769444, "epoch": 0.14671890091048193, "grad_norm": 0.08560372143983841, "kl": 0.1399688720703125, "learning_rate": 1e-06, "loss": 0.0272, "step": 845 }, { "clip_ratio/high_max": 0.006548294746608008, "clip_ratio/high_mean": 0.001425031568032864, "clip_ratio/low_mean": 0.0013761298891949991, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028011614485876635, "epoch": 0.14689253274587896, "grad_norm": 0.08029899001121521, "kl": 0.1375274658203125, "learning_rate": 1e-06, "loss": 0.027, "step": 846 }, { "clip_ratio/high_max": 0.006986761109146755, "clip_ratio/high_mean": 0.0015664685297451797, "clip_ratio/low_mean": 0.0017534331102524447, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003319901690701954, "epoch": 0.14706616458127597, "grad_norm": 0.07706810534000397, "kl": 0.14447021484375, "learning_rate": 1e-06, "loss": 0.0268, "step": 847 }, { "clip_ratio/high_max": 0.008188834392058197, "clip_ratio/high_mean": 0.001747937751133577, "clip_ratio/low_mean": 0.0024257181203211076, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004173655945123755, "epoch": 0.147239796416673, "grad_norm": 0.0802474394440651, "kl": 0.1457977294921875, "learning_rate": 1e-06, "loss": 0.0266, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3236607142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 2802.0, "completions/mean_length": 1721.4442138671875, "completions/mean_terminated_length": 1075.138671875, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.14741342825207002, "grad_norm": 0.1143452599644661, "kl": 0.189178466796875, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 89950438.0, "reward": 0.2946428656578064, "reward_std": 0.24318340420722961, "rewards/accuracy_reward/mean": 0.2946428656578064, "rewards/accuracy_reward/std": 0.45639169216156006, "step": 849 }, { "clip_ratio/high_max": 0.005030961301599746, "clip_ratio/high_mean": 0.0011178807740179764, "clip_ratio/low_mean": 0.0009205483306686801, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020384291019581724, "epoch": 0.14758706008746703, "grad_norm": 0.10094951093196869, "kl": 0.182525634765625, "learning_rate": 1e-06, "loss": 0.0404, "step": 850 }, { "clip_ratio/high_max": 0.005018846626626328, "clip_ratio/high_mean": 0.0011499578390612442, "clip_ratio/low_mean": 0.0010897898946495843, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022397477314370917, "epoch": 0.14776069192286406, "grad_norm": 0.0933699756860733, "kl": 0.1655120849609375, "learning_rate": 1e-06, "loss": 0.0402, "step": 851 }, { "clip_ratio/high_max": 0.005682444665580988, "clip_ratio/high_mean": 0.0013046574363215768, "clip_ratio/low_mean": 0.001181661034024728, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002486318498995388, "epoch": 0.14793432375826107, "grad_norm": 0.10236528515815735, "kl": 0.15899658203125, "learning_rate": 1e-06, "loss": 0.04, "step": 852 }, { "clip_ratio/high_max": 0.007136707932659192, "clip_ratio/high_mean": 0.0015666873896407196, "clip_ratio/low_mean": 0.0012595998459801194, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002826287238349323, "epoch": 0.1481079555936581, "grad_norm": 0.13872037827968597, "kl": 0.15404510498046875, "learning_rate": 1e-06, "loss": 0.0399, "step": 853 }, { "clip_ratio/high_max": 0.0073713755282369675, "clip_ratio/high_mean": 0.0016229079606091545, "clip_ratio/low_mean": 0.0015812665251360158, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032041744925663806, "epoch": 0.14828158742905512, "grad_norm": 0.09043584018945694, "kl": 0.1645355224609375, "learning_rate": 1e-06, "loss": 0.0396, "step": 854 }, { "clip_ratio/high_max": 0.009104020737140672, "clip_ratio/high_mean": 0.001914165649395727, "clip_ratio/low_mean": 0.0018903427971963538, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003804508400207851, "epoch": 0.14845521926445213, "grad_norm": 0.08749133348464966, "kl": 0.1737213134765625, "learning_rate": 1e-06, "loss": 0.0394, "step": 855 }, { "clip_ratio/high_max": 0.010631052587996237, "clip_ratio/high_mean": 0.002221706677119073, "clip_ratio/low_mean": 0.002270795146614546, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0044925017609784845, "epoch": 0.14862885109984916, "grad_norm": 0.09722864627838135, "kl": 0.194488525390625, "learning_rate": 1e-06, "loss": 0.0392, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 2678.0, "completions/mean_length": 1524.5692138671875, "completions/mean_terminated_length": 945.4692993164062, "completions/min_length": 225.0, "completions/min_terminated_length": 225.0, "epoch": 0.14880248293524617, "grad_norm": 0.14486859738826752, "kl": 0.19152069091796875, "learning_rate": 1e-06, "loss": 0.0709, "num_tokens": 90690261.0, "reward": 0.3727678656578064, "reward_std": 0.27557438611984253, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "step": 857 }, { "clip_ratio/high_max": 0.0052154557743051555, "clip_ratio/high_mean": 0.001469916244786873, "clip_ratio/low_mean": 0.0007574980913886975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022274143375398125, "epoch": 0.1489761147706432, "grad_norm": 0.10669851303100586, "kl": 0.16193389892578125, "learning_rate": 1e-06, "loss": 0.0709, "step": 858 }, { "clip_ratio/high_max": 0.0071086146162997466, "clip_ratio/high_mean": 0.0020535551193461288, "clip_ratio/low_mean": 0.00063753304743841, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026910881824733224, "epoch": 0.14914974660604022, "grad_norm": 0.11527223140001297, "kl": 0.14356231689453125, "learning_rate": 1e-06, "loss": 0.0708, "step": 859 }, { "clip_ratio/high_max": 0.007853059734770795, "clip_ratio/high_mean": 0.002179262686695438, "clip_ratio/low_mean": 0.0009163382474071113, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003095600910455687, "epoch": 0.14932337844143723, "grad_norm": 0.1114146038889885, "kl": 0.1420135498046875, "learning_rate": 1e-06, "loss": 0.0705, "step": 860 }, { "clip_ratio/high_max": 0.007900789729319513, "clip_ratio/high_mean": 0.002303321485669585, "clip_ratio/low_mean": 0.0010922002393272123, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033955216931644827, "epoch": 0.14949701027683426, "grad_norm": 0.097746841609478, "kl": 0.14189910888671875, "learning_rate": 1e-06, "loss": 0.0702, "step": 861 }, { "clip_ratio/high_max": 0.009193909329042071, "clip_ratio/high_mean": 0.002544503922763397, "clip_ratio/low_mean": 0.0012546804132398393, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037991842946212273, "epoch": 0.14967064211223127, "grad_norm": 0.09073591232299805, "kl": 0.13832855224609375, "learning_rate": 1e-06, "loss": 0.07, "step": 862 }, { "clip_ratio/high_max": 0.009481766886892729, "clip_ratio/high_mean": 0.0026476747889319086, "clip_ratio/low_mean": 0.001663233610088355, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004310908345360076, "epoch": 0.1498442739476283, "grad_norm": 0.0889631062746048, "kl": 0.14571380615234375, "learning_rate": 1e-06, "loss": 0.0698, "step": 863 }, { "clip_ratio/high_max": 0.011594357398280408, "clip_ratio/high_mean": 0.0031343910977739142, "clip_ratio/low_mean": 0.0021074853648315184, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005241876515356125, "epoch": 0.15001790578302532, "grad_norm": 0.08698555827140808, "kl": 0.14251708984375, "learning_rate": 1e-06, "loss": 0.0695, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 3072.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 1353.10498046875, "completions/mean_terminated_length": 914.9552001953125, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.15019153761842233, "grad_norm": 8.078048706054688, "kl": 3.991973876953125, "learning_rate": 1e-06, "loss": 0.0492, "num_tokens": 91356116.0, "reward": 0.4464285969734192, "reward_std": 0.3001542091369629, "rewards/accuracy_reward/mean": 0.4464285671710968, "rewards/accuracy_reward/std": 0.49767759442329407, "step": 865 }, { "clip_ratio/high_max": 0.008058459476160351, "clip_ratio/high_mean": 0.0021523788691411028, "clip_ratio/low_mean": 0.0009899353390210308, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031423141990671866, "epoch": 0.15036516945381936, "grad_norm": 0.16608008742332458, "kl": 0.202239990234375, "learning_rate": 1e-06, "loss": 0.0457, "step": 866 }, { "clip_ratio/high_max": 0.008411450595303904, "clip_ratio/high_mean": 0.0023030985503282864, "clip_ratio/low_mean": 0.001073062207979092, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033761608028726187, "epoch": 0.15053880128921637, "grad_norm": 0.16622518002986908, "kl": 0.18951416015625, "learning_rate": 1e-06, "loss": 0.0454, "step": 867 }, { "clip_ratio/high_max": 0.008667983005580027, "clip_ratio/high_mean": 0.002392343078099657, "clip_ratio/low_mean": 0.001348203352335986, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003740546431799885, "epoch": 0.1507124331246134, "grad_norm": 0.1412827968597412, "kl": 0.1951141357421875, "learning_rate": 1e-06, "loss": 0.0451, "step": 868 }, { "clip_ratio/high_max": 0.010836924084287602, "clip_ratio/high_mean": 0.002884870633351966, "clip_ratio/low_mean": 0.0016153793624198443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0045002499828115106, "epoch": 0.15088606496001042, "grad_norm": 0.1274917870759964, "kl": 0.179290771484375, "learning_rate": 1e-06, "loss": 0.0447, "step": 869 }, { "clip_ratio/high_max": 0.011616119736572728, "clip_ratio/high_mean": 0.0030119990005914588, "clip_ratio/low_mean": 0.00215753332577151, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005169532145373523, "epoch": 0.15105969679540743, "grad_norm": 0.12527112662792206, "kl": 0.1872100830078125, "learning_rate": 1e-06, "loss": 0.0444, "step": 870 }, { "clip_ratio/high_max": 0.01220913085853681, "clip_ratio/high_mean": 0.0033669365038804244, "clip_ratio/low_mean": 0.0027959510730397596, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00616288749006344, "epoch": 0.15123332863080446, "grad_norm": 0.13680215179920197, "kl": 0.19427490234375, "learning_rate": 1e-06, "loss": 0.0441, "step": 871 }, { "clip_ratio/high_max": 0.01476610332611017, "clip_ratio/high_mean": 0.0038485567638417706, "clip_ratio/low_mean": 0.003591475360735785, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007440032146405429, "epoch": 0.15140696046620147, "grad_norm": 0.12479729950428009, "kl": 0.1935272216796875, "learning_rate": 1e-06, "loss": 0.0437, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 1501.6004638671875, "completions/mean_terminated_length": 913.9049072265625, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.1515805923015985, "grad_norm": 0.3062911033630371, "kl": 0.3077392578125, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 92089169.0, "reward": 0.3482142984867096, "reward_std": 0.2616059184074402, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "step": 873 }, { "clip_ratio/high_max": 0.006005596591421636, "clip_ratio/high_mean": 0.0014710901355101669, "clip_ratio/low_mean": 0.001218115859956015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002689206026843749, "epoch": 0.15175422413699552, "grad_norm": 5.979277610778809, "kl": 0.229644775390625, "learning_rate": 1e-06, "loss": 0.0334, "step": 874 }, { "clip_ratio/high_max": 0.007493168805012829, "clip_ratio/high_mean": 0.0018092055065608292, "clip_ratio/low_mean": 0.0011584153407966369, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002967620897834422, "epoch": 0.15192785597239253, "grad_norm": 104.80470275878906, "kl": 37.444183349609375, "learning_rate": 1e-06, "loss": 0.07, "step": 875 }, { "clip_ratio/high_max": 0.00941884516942082, "clip_ratio/high_mean": 0.0023253675954038044, "clip_ratio/low_mean": 0.0012891544174635783, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003614522000134457, "epoch": 0.15210148780778956, "grad_norm": 1.1972529888153076, "kl": 0.6775054931640625, "learning_rate": 1e-06, "loss": 0.0332, "step": 876 }, { "clip_ratio/high_max": 0.009923402216372779, "clip_ratio/high_mean": 0.002382263496201631, "clip_ratio/low_mean": 0.0017402880330337211, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0041225515233236365, "epoch": 0.15227511964318657, "grad_norm": 30.32253074645996, "kl": 0.1679229736328125, "learning_rate": 1e-06, "loss": 0.0357, "step": 877 }, { "clip_ratio/high_max": 0.012175648225820623, "clip_ratio/high_mean": 0.0029964989189465996, "clip_ratio/low_mean": 0.0020207380530337105, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0050172368792118505, "epoch": 0.1524487514785836, "grad_norm": 0.2968408763408661, "kl": 0.26641845703125, "learning_rate": 1e-06, "loss": 0.0328, "step": 878 }, { "clip_ratio/high_max": 0.013290120899910107, "clip_ratio/high_mean": 0.0032297097004629904, "clip_ratio/low_mean": 0.002114436199917691, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005344145822164137, "epoch": 0.15262238331398062, "grad_norm": 1415.5841064453125, "kl": 844.1442260742188, "learning_rate": 1e-06, "loss": 0.8777, "step": 879 }, { "clip_ratio/high_max": 0.01374676847626688, "clip_ratio/high_mean": 0.003165403037201031, "clip_ratio/low_mean": 0.00200554638831818, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0051709493636735715, "epoch": 0.15279601514937763, "grad_norm": 655.6858520507812, "kl": 512.1507110595703, "learning_rate": 1e-06, "loss": 0.5441, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2991071428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 3059.0, "completions/mean_length": 1627.2254638671875, "completions/mean_terminated_length": 1010.6656494140625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.15296964698477467, "grad_norm": 0.11541352421045303, "kl": 0.14556884765625, "learning_rate": 1e-06, "loss": 0.0403, "num_tokens": 92881750.0, "reward": 0.3258928656578064, "reward_std": 0.24964658915996552, "rewards/accuracy_reward/mean": 0.3258928656578064, "rewards/accuracy_reward/std": 0.46923142671585083, "step": 881 }, { "clip_ratio/high_max": 0.004717241230537184, "clip_ratio/high_mean": 0.0012808635474357288, "clip_ratio/low_mean": 0.0015374835072634596, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028183470749354456, "epoch": 0.15314327882017167, "grad_norm": 0.15199480950832367, "kl": 0.185699462890625, "learning_rate": 1e-06, "loss": 0.0405, "step": 882 }, { "clip_ratio/high_max": 0.005958028883469524, "clip_ratio/high_mean": 0.00162895063203905, "clip_ratio/low_mean": 0.0020991795786358125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003728130199306179, "epoch": 0.1533169106555687, "grad_norm": 0.18723994493484497, "kl": 0.199310302734375, "learning_rate": 1e-06, "loss": 0.0404, "step": 883 }, { "clip_ratio/high_max": 0.006515397923067212, "clip_ratio/high_mean": 0.001903494192447397, "clip_ratio/low_mean": 0.0023285353604478587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004232029514241731, "epoch": 0.15349054249096572, "grad_norm": 0.1845727264881134, "kl": 0.19329833984375, "learning_rate": 1e-06, "loss": 0.0403, "step": 884 }, { "clip_ratio/high_max": 0.007016907329671085, "clip_ratio/high_mean": 0.0019444639219727833, "clip_ratio/low_mean": 0.0025036533379534376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004448117200809065, "epoch": 0.15366417432636273, "grad_norm": 0.16492821276187897, "kl": 0.16790771484375, "learning_rate": 1e-06, "loss": 0.04, "step": 885 }, { "clip_ratio/high_max": 0.008025230003113393, "clip_ratio/high_mean": 0.0021938635163678555, "clip_ratio/low_mean": 0.0022905651389919512, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004484428696741816, "epoch": 0.15383780616175977, "grad_norm": 0.12576481699943542, "kl": 0.14166259765625, "learning_rate": 1e-06, "loss": 0.0397, "step": 886 }, { "clip_ratio/high_max": 0.009072638422367163, "clip_ratio/high_mean": 0.002443031611619517, "clip_ratio/low_mean": 0.0023167934450611938, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004759825043947785, "epoch": 0.15401143799715677, "grad_norm": 0.10946796834468842, "kl": 0.1335296630859375, "learning_rate": 1e-06, "loss": 0.0393, "step": 887 }, { "clip_ratio/high_max": 0.010184858190768864, "clip_ratio/high_mean": 0.002793037523588282, "clip_ratio/low_mean": 0.0022350986664605443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005028136138207628, "epoch": 0.1541850698325538, "grad_norm": 0.12533190846443176, "kl": 0.1181182861328125, "learning_rate": 1e-06, "loss": 0.0391, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2633928571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 2970.0, "completions/mean_length": 1550.0960693359375, "completions/mean_terminated_length": 1005.8999633789062, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.15435870166795082, "grad_norm": 0.11338287591934204, "kl": 0.1327972412109375, "learning_rate": 1e-06, "loss": 0.0445, "num_tokens": 93642033.0, "reward": 0.392857164144516, "reward_std": 0.2282341867685318, "rewards/accuracy_reward/mean": 0.3928571343421936, "rewards/accuracy_reward/std": 0.48893147706985474, "step": 889 }, { "clip_ratio/high_max": 0.006820378956035711, "clip_ratio/high_mean": 0.001664427257310308, "clip_ratio/low_mean": 0.0006397442957677413, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002304171544892597, "epoch": 0.15453233350334783, "grad_norm": 0.10192667692899704, "kl": 0.13323211669921875, "learning_rate": 1e-06, "loss": 0.0446, "step": 890 }, { "clip_ratio/high_max": 0.007388663339952473, "clip_ratio/high_mean": 0.001906324543597293, "clip_ratio/low_mean": 0.0007081802091306599, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002614504775920068, "epoch": 0.15470596533874487, "grad_norm": 0.10940509289503098, "kl": 0.12957000732421875, "learning_rate": 1e-06, "loss": 0.0444, "step": 891 }, { "clip_ratio/high_max": 0.00858733960558311, "clip_ratio/high_mean": 0.002030277784797363, "clip_ratio/low_mean": 0.0010388281816631206, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003069105989197851, "epoch": 0.15487959717414188, "grad_norm": 0.1064591109752655, "kl": 0.13448333740234375, "learning_rate": 1e-06, "loss": 0.0442, "step": 892 }, { "clip_ratio/high_max": 0.009596916985174175, "clip_ratio/high_mean": 0.0023207257318063057, "clip_ratio/low_mean": 0.0011265199077570287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034472456718503963, "epoch": 0.1550532290095389, "grad_norm": 0.10585889965295792, "kl": 0.1353607177734375, "learning_rate": 1e-06, "loss": 0.044, "step": 893 }, { "clip_ratio/high_max": 0.010601307905744761, "clip_ratio/high_mean": 0.002443002213112777, "clip_ratio/low_mean": 0.0013380058817347162, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037810081648785854, "epoch": 0.15522686084493592, "grad_norm": 0.09518319368362427, "kl": 0.1323699951171875, "learning_rate": 1e-06, "loss": 0.0438, "step": 894 }, { "clip_ratio/high_max": 0.012033580391289433, "clip_ratio/high_mean": 0.002780898721084668, "clip_ratio/low_mean": 0.0015114724874365493, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0042923712126139435, "epoch": 0.15540049268033293, "grad_norm": 0.09192158281803131, "kl": 0.13518524169921875, "learning_rate": 1e-06, "loss": 0.0436, "step": 895 }, { "clip_ratio/high_max": 0.013536700967961224, "clip_ratio/high_mean": 0.0030581116643588757, "clip_ratio/low_mean": 0.0017890673359488574, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004847179070566199, "epoch": 0.15557412451572997, "grad_norm": 0.08484160900115967, "kl": 0.1328887939453125, "learning_rate": 1e-06, "loss": 0.0433, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 2912.0, "completions/mean_length": 1514.1563720703125, "completions/mean_terminated_length": 969.849365234375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "epoch": 0.15574775635112698, "grad_norm": 0.1522524505853653, "kl": 0.2014617919921875, "learning_rate": 1e-06, "loss": 0.0564, "num_tokens": 94379167.0, "reward": 0.3861607313156128, "reward_std": 0.29555460810661316, "rewards/accuracy_reward/mean": 0.3861607015132904, "rewards/accuracy_reward/std": 0.4874124228954315, "step": 897 }, { "clip_ratio/high_max": 0.00550125688459957, "clip_ratio/high_mean": 0.0015243187017404125, "clip_ratio/low_mean": 0.0009715012865854078, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024958200119726826, "epoch": 0.155921388186524, "grad_norm": 0.5123240351676941, "kl": 0.185302734375, "learning_rate": 1e-06, "loss": 0.0565, "step": 898 }, { "clip_ratio/high_max": 0.006912529563123826, "clip_ratio/high_mean": 0.0018884971095758374, "clip_ratio/low_mean": 0.0011846136499116255, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003073110812692903, "epoch": 0.15609502002192102, "grad_norm": 0.13096073269844055, "kl": 0.1790771484375, "learning_rate": 1e-06, "loss": 0.0562, "step": 899 }, { "clip_ratio/high_max": 0.00815147483808687, "clip_ratio/high_mean": 0.002163268442018307, "clip_ratio/low_mean": 0.0013451540871756151, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003508422480081208, "epoch": 0.15626865185731803, "grad_norm": 0.12589219212532043, "kl": 0.1631927490234375, "learning_rate": 1e-06, "loss": 0.0559, "step": 900 }, { "clip_ratio/high_max": 0.010154776624403894, "clip_ratio/high_mean": 0.0025176213875965914, "clip_ratio/low_mean": 0.0017594529699636041, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004277074367564637, "epoch": 0.15644228369271507, "grad_norm": 0.11508487164974213, "kl": 0.154815673828125, "learning_rate": 1e-06, "loss": 0.0555, "step": 901 }, { "clip_ratio/high_max": 0.011173778962984215, "clip_ratio/high_mean": 0.0027963733500655508, "clip_ratio/low_mean": 0.0021315905314622796, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004927963847876526, "epoch": 0.15661591552811208, "grad_norm": 0.1029396802186966, "kl": 0.1509552001953125, "learning_rate": 1e-06, "loss": 0.0551, "step": 902 }, { "clip_ratio/high_max": 0.01246674996218644, "clip_ratio/high_mean": 0.003082229739447939, "clip_ratio/low_mean": 0.002769789383819443, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005852019108715467, "epoch": 0.1567895473635091, "grad_norm": 0.09859399497509003, "kl": 0.1487884521484375, "learning_rate": 1e-06, "loss": 0.0548, "step": 903 }, { "clip_ratio/high_max": 0.014319734975288156, "clip_ratio/high_mean": 0.00344792406031047, "clip_ratio/low_mean": 0.0032204151721089147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00666833930881694, "epoch": 0.15696317919890612, "grad_norm": 0.10959434509277344, "kl": 0.149078369140625, "learning_rate": 1e-06, "loss": 0.0545, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 2791.0, "completions/mean_length": 1584.134033203125, "completions/mean_terminated_length": 1027.3250732421875, "completions/min_length": 259.0, "completions/min_terminated_length": 259.0, "epoch": 0.15713681103430313, "grad_norm": 0.13496579229831696, "kl": 0.1616973876953125, "learning_rate": 1e-06, "loss": 0.0154, "num_tokens": 95152291.0, "reward": 0.3549107313156128, "reward_std": 0.2540750503540039, "rewards/accuracy_reward/mean": 0.3549107015132904, "rewards/accuracy_reward/std": 0.4790211319923401, "step": 905 }, { "clip_ratio/high_max": 0.004416956835484598, "clip_ratio/high_mean": 0.0010406354622318759, "clip_ratio/low_mean": 0.0010507342594792135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020913697280775523, "epoch": 0.15731044286970017, "grad_norm": 0.11827404797077179, "kl": 0.1520843505859375, "learning_rate": 1e-06, "loss": 0.0154, "step": 906 }, { "clip_ratio/high_max": 0.004820735408429755, "clip_ratio/high_mean": 0.001163434339559899, "clip_ratio/low_mean": 0.0011278019046585541, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002291236254677642, "epoch": 0.15748407470509718, "grad_norm": 0.11627812683582306, "kl": 0.1588287353515625, "learning_rate": 1e-06, "loss": 0.0153, "step": 907 }, { "clip_ratio/high_max": 0.005921167830820195, "clip_ratio/high_mean": 0.0014056158279345254, "clip_ratio/low_mean": 0.0013436933704724652, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027493091620272025, "epoch": 0.15765770654049419, "grad_norm": 0.1091492772102356, "kl": 0.1491241455078125, "learning_rate": 1e-06, "loss": 0.015, "step": 908 }, { "clip_ratio/high_max": 0.007158823435020167, "clip_ratio/high_mean": 0.0016075302623903553, "clip_ratio/low_mean": 0.0015963310270308284, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003203861298970878, "epoch": 0.15783133837589122, "grad_norm": 0.10381630808115005, "kl": 0.1412200927734375, "learning_rate": 1e-06, "loss": 0.0147, "step": 909 }, { "clip_ratio/high_max": 0.008181025936210062, "clip_ratio/high_mean": 0.0018038738598988857, "clip_ratio/low_mean": 0.0020721598284580978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0038760336647101212, "epoch": 0.15800497021128823, "grad_norm": 0.09968207776546478, "kl": 0.141998291015625, "learning_rate": 1e-06, "loss": 0.0145, "step": 910 }, { "clip_ratio/high_max": 0.01036207024299074, "clip_ratio/high_mean": 0.00228525260627066, "clip_ratio/low_mean": 0.002138003135769395, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004423255715664709, "epoch": 0.15817860204668527, "grad_norm": 0.09740278124809265, "kl": 0.147064208984375, "learning_rate": 1e-06, "loss": 0.0142, "step": 911 }, { "clip_ratio/high_max": 0.011924916245334316, "clip_ratio/high_mean": 0.0026104566732101375, "clip_ratio/low_mean": 0.0026423565268487437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005252813218248775, "epoch": 0.15835223388208228, "grad_norm": 0.09597580134868622, "kl": 0.1536865234375, "learning_rate": 1e-06, "loss": 0.0139, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2723214285714286, "completions/max_length": 3072.0, "completions/max_terminated_length": 2741.0, "completions/mean_length": 1516.1251220703125, "completions/mean_terminated_length": 933.864990234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.1585258657174793, "grad_norm": 0.14744508266448975, "kl": 0.2043609619140625, "learning_rate": 1e-06, "loss": 0.032, "num_tokens": 95895731.0, "reward": 0.377232164144516, "reward_std": 0.22198715806007385, "rewards/accuracy_reward/mean": 0.3772321343421936, "rewards/accuracy_reward/std": 0.48523563146591187, "step": 913 }, { "clip_ratio/high_max": 0.005810086475321441, "clip_ratio/high_mean": 0.0013954947030470066, "clip_ratio/low_mean": 0.0007371965027118677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00213269117193704, "epoch": 0.15869949755287632, "grad_norm": 0.1261293888092041, "kl": 0.1912689208984375, "learning_rate": 1e-06, "loss": 0.032, "step": 914 }, { "clip_ratio/high_max": 0.006491629446827574, "clip_ratio/high_mean": 0.001499600093211484, "clip_ratio/low_mean": 0.0008339546888009863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023335547739407048, "epoch": 0.15887312938827333, "grad_norm": 0.11715462058782578, "kl": 0.1907196044921875, "learning_rate": 1e-06, "loss": 0.0319, "step": 915 }, { "clip_ratio/high_max": 0.008485343767461018, "clip_ratio/high_mean": 0.0019476959701023588, "clip_ratio/low_mean": 0.0009233188588382291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028710148217214737, "epoch": 0.15904676122367037, "grad_norm": 0.11355358362197876, "kl": 0.1687164306640625, "learning_rate": 1e-06, "loss": 0.0316, "step": 916 }, { "clip_ratio/high_max": 0.010508988270885311, "clip_ratio/high_mean": 0.0023147043343669793, "clip_ratio/low_mean": 0.001193634039623248, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035083384082099656, "epoch": 0.15922039305906738, "grad_norm": 0.10846663266420364, "kl": 0.161224365234375, "learning_rate": 1e-06, "loss": 0.0313, "step": 917 }, { "clip_ratio/high_max": 0.01150400781261851, "clip_ratio/high_mean": 0.0025653880052232125, "clip_ratio/low_mean": 0.0014605403652012683, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004025928406917956, "epoch": 0.1593940248944644, "grad_norm": 0.09597496688365936, "kl": 0.155120849609375, "learning_rate": 1e-06, "loss": 0.031, "step": 918 }, { "clip_ratio/high_max": 0.013127522655850044, "clip_ratio/high_mean": 0.0028911758008689503, "clip_ratio/low_mean": 0.0018539290847456869, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004745104875837569, "epoch": 0.15956765672986142, "grad_norm": 0.08873812854290009, "kl": 0.15716552734375, "learning_rate": 1e-06, "loss": 0.0307, "step": 919 }, { "clip_ratio/high_max": 0.014764678657229524, "clip_ratio/high_mean": 0.0032963523972284747, "clip_ratio/low_mean": 0.0023280933216938138, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005624445740977535, "epoch": 0.15974128856525843, "grad_norm": 0.0859653577208519, "kl": 0.160675048828125, "learning_rate": 1e-06, "loss": 0.0304, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2232142857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2910.0, "completions/mean_length": 1424.6251220703125, "completions/mean_terminated_length": 951.2413940429688, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.15991492040065547, "grad_norm": 0.25446581840515137, "kl": 0.21942138671875, "learning_rate": 1e-06, "loss": 0.0326, "num_tokens": 96599507.0, "reward": 0.408482164144516, "reward_std": 0.20245972275733948, "rewards/accuracy_reward/mean": 0.4084821343421936, "rewards/accuracy_reward/std": 0.49210265278816223, "step": 921 }, { "clip_ratio/high_max": 0.004421700243256055, "clip_ratio/high_mean": 0.00100727454423577, "clip_ratio/low_mean": 0.0006827429704117094, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016900175351111102, "epoch": 0.16008855223605248, "grad_norm": 0.09727432578802109, "kl": 0.1445159912109375, "learning_rate": 1e-06, "loss": 0.0326, "step": 922 }, { "clip_ratio/high_max": 0.004937046121995081, "clip_ratio/high_mean": 0.0011141099118958664, "clip_ratio/low_mean": 0.0006963989781070268, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018105088829543092, "epoch": 0.1602621840714495, "grad_norm": 0.0934927687048912, "kl": 0.132110595703125, "learning_rate": 1e-06, "loss": 0.0325, "step": 923 }, { "clip_ratio/high_max": 0.005673978794220602, "clip_ratio/high_mean": 0.0012879318574050558, "clip_ratio/low_mean": 0.0008385908959098742, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002126522740582004, "epoch": 0.16043581590684652, "grad_norm": 0.09041892737150192, "kl": 0.127197265625, "learning_rate": 1e-06, "loss": 0.0323, "step": 924 }, { "clip_ratio/high_max": 0.006525620203319704, "clip_ratio/high_mean": 0.0014603779636672698, "clip_ratio/low_mean": 0.0010899894405156374, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025503673750790767, "epoch": 0.16060944774224353, "grad_norm": 0.08873995393514633, "kl": 0.125030517578125, "learning_rate": 1e-06, "loss": 0.0321, "step": 925 }, { "clip_ratio/high_max": 0.007691851609706646, "clip_ratio/high_mean": 0.0017400943042957806, "clip_ratio/low_mean": 0.0013741323937210836, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003114226661637076, "epoch": 0.16078307957764057, "grad_norm": 0.08343217521905899, "kl": 0.1224365234375, "learning_rate": 1e-06, "loss": 0.0318, "step": 926 }, { "clip_ratio/high_max": 0.007834755495423451, "clip_ratio/high_mean": 0.0017878342669064295, "clip_ratio/low_mean": 0.0016269183270196663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034147526130254846, "epoch": 0.16095671141303758, "grad_norm": 0.07612474262714386, "kl": 0.130340576171875, "learning_rate": 1e-06, "loss": 0.0316, "step": 927 }, { "clip_ratio/high_max": 0.009047457257111091, "clip_ratio/high_mean": 0.002010950095609587, "clip_ratio/low_mean": 0.002057542324109818, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004068492477017571, "epoch": 0.1611303432484346, "grad_norm": 0.07373934984207153, "kl": 0.13055419921875, "learning_rate": 1e-06, "loss": 0.0314, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2477678571428571, "completions/max_length": 3072.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 1481.58935546875, "completions/mean_terminated_length": 957.7448120117188, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.16130397508383162, "grad_norm": 0.131914421916008, "kl": 0.161407470703125, "learning_rate": 1e-06, "loss": 0.0296, "num_tokens": 97322291.0, "reward": 0.3839285969734192, "reward_std": 0.2548309564590454, "rewards/accuracy_reward/mean": 0.3839285671710968, "rewards/accuracy_reward/std": 0.48688453435897827, "step": 929 }, { "clip_ratio/high_max": 0.005351330913981656, "clip_ratio/high_mean": 0.001393319860653719, "clip_ratio/low_mean": 0.0007540787455582176, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00214739861723956, "epoch": 0.16147760691922863, "grad_norm": 0.11463475227355957, "kl": 0.16900634765625, "learning_rate": 1e-06, "loss": 0.0297, "step": 930 }, { "clip_ratio/high_max": 0.005783101973065641, "clip_ratio/high_mean": 0.0015186478851774154, "clip_ratio/low_mean": 0.0009552153096592519, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002473863190289194, "epoch": 0.16165123875462567, "grad_norm": 0.11169826239347458, "kl": 0.1670379638671875, "learning_rate": 1e-06, "loss": 0.0295, "step": 931 }, { "clip_ratio/high_max": 0.006810150625824463, "clip_ratio/high_mean": 0.001745838383158116, "clip_ratio/low_mean": 0.0010782483266211784, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002824086703640205, "epoch": 0.16182487059002268, "grad_norm": 0.10373306274414062, "kl": 0.1589202880859375, "learning_rate": 1e-06, "loss": 0.0292, "step": 932 }, { "clip_ratio/high_max": 0.008554877975257114, "clip_ratio/high_mean": 0.0021850516541235265, "clip_ratio/low_mean": 0.0012581567284541961, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034432082707098743, "epoch": 0.1619985024254197, "grad_norm": 0.09929928183555603, "kl": 0.15264892578125, "learning_rate": 1e-06, "loss": 0.0289, "step": 933 }, { "clip_ratio/high_max": 0.009545157998218201, "clip_ratio/high_mean": 0.0024292983366649423, "clip_ratio/low_mean": 0.0015574169672163407, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0039867152818260365, "epoch": 0.16217213426081672, "grad_norm": 0.09333515912294388, "kl": 0.150787353515625, "learning_rate": 1e-06, "loss": 0.0286, "step": 934 }, { "clip_ratio/high_max": 0.011963960631874215, "clip_ratio/high_mean": 0.002952292394866163, "clip_ratio/low_mean": 0.0018751789177713363, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004827471153021179, "epoch": 0.16234576609621373, "grad_norm": 0.09114494919776917, "kl": 0.1477508544921875, "learning_rate": 1e-06, "loss": 0.0284, "step": 935 }, { "clip_ratio/high_max": 0.013327172895515105, "clip_ratio/high_mean": 0.0032485345382156083, "clip_ratio/low_mean": 0.0023809844064999197, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005629518972909864, "epoch": 0.16251939793161077, "grad_norm": 0.08326893299818039, "kl": 0.144744873046875, "learning_rate": 1e-06, "loss": 0.0281, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3013392857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 1576.5335693359375, "completions/mean_terminated_length": 931.52392578125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.16269302976700778, "grad_norm": 0.13788504898548126, "kl": 0.1492462158203125, "learning_rate": 1e-06, "loss": 0.0444, "num_tokens": 98097762.0, "reward": 0.4218750298023224, "reward_std": 0.23702329397201538, "rewards/accuracy_reward/mean": 0.421875, "rewards/accuracy_reward/std": 0.4944108724594116, "step": 937 }, { "clip_ratio/high_max": 0.006676072203845251, "clip_ratio/high_mean": 0.0015688446983403992, "clip_ratio/low_mean": 0.0008852801820466993, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024541248840250773, "epoch": 0.1628666616024048, "grad_norm": 0.11060431599617004, "kl": 0.144866943359375, "learning_rate": 1e-06, "loss": 0.0444, "step": 938 }, { "clip_ratio/high_max": 0.007602396581205539, "clip_ratio/high_mean": 0.001765076855008374, "clip_ratio/low_mean": 0.0009890439685023011, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027541208310140064, "epoch": 0.16304029343780183, "grad_norm": 0.10513715445995331, "kl": 0.14764404296875, "learning_rate": 1e-06, "loss": 0.0442, "step": 939 }, { "clip_ratio/high_max": 0.007901095610577613, "clip_ratio/high_mean": 0.0019411001430853503, "clip_ratio/low_mean": 0.0011568504180559103, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030979505227151094, "epoch": 0.16321392527319883, "grad_norm": 0.09890756756067276, "kl": 0.1473236083984375, "learning_rate": 1e-06, "loss": 0.0439, "step": 940 }, { "clip_ratio/high_max": 0.009869153713225387, "clip_ratio/high_mean": 0.0023990461322682677, "clip_ratio/low_mean": 0.0013546833740747388, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003753729510208359, "epoch": 0.16338755710859587, "grad_norm": 0.09631571173667908, "kl": 0.143524169921875, "learning_rate": 1e-06, "loss": 0.0437, "step": 941 }, { "clip_ratio/high_max": 0.012245953505043872, "clip_ratio/high_mean": 0.0028507435345090926, "clip_ratio/low_mean": 0.0014854727673991874, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0043362163323763525, "epoch": 0.16356118894399288, "grad_norm": 0.09310811758041382, "kl": 0.143157958984375, "learning_rate": 1e-06, "loss": 0.0434, "step": 942 }, { "clip_ratio/high_max": 0.013323418126674369, "clip_ratio/high_mean": 0.003110613462922629, "clip_ratio/low_mean": 0.0018303368131000752, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0049409502553317, "epoch": 0.1637348207793899, "grad_norm": 0.08485394716262817, "kl": 0.143798828125, "learning_rate": 1e-06, "loss": 0.0432, "step": 943 }, { "clip_ratio/high_max": 0.01425111381104216, "clip_ratio/high_mean": 0.0033712294316501357, "clip_ratio/low_mean": 0.002128669803369121, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005499899187270785, "epoch": 0.16390845261478693, "grad_norm": 0.07904484868049622, "kl": 0.14508056640625, "learning_rate": 1e-06, "loss": 0.043, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2433035714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 1527.96435546875, "completions/mean_terminated_length": 1031.50439453125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "epoch": 0.16408208445018393, "grad_norm": 0.13342858850955963, "kl": 0.2054443359375, "learning_rate": 1e-06, "loss": 0.0294, "num_tokens": 98845074.0, "reward": 0.3950892984867096, "reward_std": 0.24626386165618896, "rewards/accuracy_reward/mean": 0.3950892984867096, "rewards/accuracy_reward/std": 0.4894163906574249, "step": 945 }, { "clip_ratio/high_max": 0.005238272500719177, "clip_ratio/high_mean": 0.001341552333542495, "clip_ratio/low_mean": 0.0009257701990463829, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022673225485050352, "epoch": 0.16425571628558097, "grad_norm": 0.11265740543603897, "kl": 0.2053985595703125, "learning_rate": 1e-06, "loss": 0.0295, "step": 946 }, { "clip_ratio/high_max": 0.0061576606349262875, "clip_ratio/high_mean": 0.0015187947674348834, "clip_ratio/low_mean": 0.0009074031599993759, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024261979433504166, "epoch": 0.16442934812097798, "grad_norm": 0.11026738584041595, "kl": 0.186248779296875, "learning_rate": 1e-06, "loss": 0.0293, "step": 947 }, { "clip_ratio/high_max": 0.007468061325198505, "clip_ratio/high_mean": 0.0018304927198187215, "clip_ratio/low_mean": 0.0010908650015153398, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029213576926849782, "epoch": 0.164602979956375, "grad_norm": 0.10403688251972198, "kl": 0.170074462890625, "learning_rate": 1e-06, "loss": 0.0291, "step": 948 }, { "clip_ratio/high_max": 0.008347656330442987, "clip_ratio/high_mean": 0.002040548039076384, "clip_ratio/low_mean": 0.001232708077168354, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0032732561485318, "epoch": 0.16477661179177203, "grad_norm": 0.09716847538948059, "kl": 0.1638336181640625, "learning_rate": 1e-06, "loss": 0.0288, "step": 949 }, { "clip_ratio/high_max": 0.009593476293957792, "clip_ratio/high_mean": 0.002267624979140237, "clip_ratio/low_mean": 0.0015981837168510538, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003865808670525439, "epoch": 0.16495024362716904, "grad_norm": 0.08807893097400665, "kl": 0.1682891845703125, "learning_rate": 1e-06, "loss": 0.0285, "step": 950 }, { "clip_ratio/high_max": 0.010841219060239382, "clip_ratio/high_mean": 0.002565261907875538, "clip_ratio/low_mean": 0.0018245264109282289, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004389788377011428, "epoch": 0.16512387546256607, "grad_norm": 0.08644479513168335, "kl": 0.1653900146484375, "learning_rate": 1e-06, "loss": 0.0283, "step": 951 }, { "clip_ratio/high_max": 0.011645914739347063, "clip_ratio/high_mean": 0.002802307019010186, "clip_ratio/low_mean": 0.00213338018511422, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004935687127726851, "epoch": 0.16529750729796308, "grad_norm": 0.08262400329113007, "kl": 0.168304443359375, "learning_rate": 1e-06, "loss": 0.0281, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2924107142857143, "completions/max_length": 3072.0, "completions/max_terminated_length": 2988.0, "completions/mean_length": 1585.35498046875, "completions/mean_terminated_length": 971.0, "completions/min_length": 284.0, "completions/min_terminated_length": 284.0, "epoch": 0.1654711391333601, "grad_norm": 0.16742214560508728, "kl": 0.2476806640625, "learning_rate": 1e-06, "loss": 0.0424, "num_tokens": 99614089.0, "reward": 0.4375000298023224, "reward_std": 0.2677595615386963, "rewards/accuracy_reward/mean": 0.4375, "rewards/accuracy_reward/std": 0.49663296341896057, "step": 953 }, { "clip_ratio/high_max": 0.0074750677740667015, "clip_ratio/high_mean": 0.0019572647152017453, "clip_ratio/low_mean": 0.000668495369154698, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026257600766257383, "epoch": 0.16564477096875713, "grad_norm": 0.11842747032642365, "kl": 0.200653076171875, "learning_rate": 1e-06, "loss": 0.0424, "step": 954 }, { "clip_ratio/high_max": 0.008457405892841052, "clip_ratio/high_mean": 0.0022785109067626763, "clip_ratio/low_mean": 0.0007285682509063918, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0030070791581238154, "epoch": 0.16581840280415414, "grad_norm": 0.12092433124780655, "kl": 0.177276611328125, "learning_rate": 1e-06, "loss": 0.0423, "step": 955 }, { "clip_ratio/high_max": 0.010716812641476281, "clip_ratio/high_mean": 0.0027578262252063723, "clip_ratio/low_mean": 0.0007974519007802883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035552781300793868, "epoch": 0.16599203463955117, "grad_norm": 0.11912057548761368, "kl": 0.1709747314453125, "learning_rate": 1e-06, "loss": 0.042, "step": 956 }, { "clip_ratio/high_max": 0.011173059727298096, "clip_ratio/high_mean": 0.0029642793833772885, "clip_ratio/low_mean": 0.0010529854807828087, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004017264833237277, "epoch": 0.16616566647494818, "grad_norm": 0.1084752157330513, "kl": 0.1572113037109375, "learning_rate": 1e-06, "loss": 0.0417, "step": 957 }, { "clip_ratio/high_max": 0.011986806581262499, "clip_ratio/high_mean": 0.003073246245548944, "clip_ratio/low_mean": 0.0014954653925087769, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004568711665342562, "epoch": 0.1663392983103452, "grad_norm": 0.10028492659330368, "kl": 0.1546478271484375, "learning_rate": 1e-06, "loss": 0.0414, "step": 958 }, { "clip_ratio/high_max": 0.011979561739281053, "clip_ratio/high_mean": 0.0032052998440121883, "clip_ratio/low_mean": 0.0018677391162782442, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005073038904811256, "epoch": 0.16651293014574223, "grad_norm": 0.10366502404212952, "kl": 0.1610260009765625, "learning_rate": 1e-06, "loss": 0.0412, "step": 959 }, { "clip_ratio/high_max": 0.014232169050956145, "clip_ratio/high_mean": 0.00370902888062119, "clip_ratio/low_mean": 0.0021720106497014058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005881039454834536, "epoch": 0.16668656198113924, "grad_norm": 0.09346062690019608, "kl": 0.149932861328125, "learning_rate": 1e-06, "loss": 0.0409, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2544642857142857, "completions/max_length": 3072.0, "completions/max_terminated_length": 2955.0, "completions/mean_length": 1489.1429443359375, "completions/mean_terminated_length": 948.8862915039062, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.16686019381653627, "grad_norm": 0.11045411974191666, "kl": 0.1573333740234375, "learning_rate": 1e-06, "loss": 0.0334, "num_tokens": 100345465.0, "reward": 0.3727678656578064, "reward_std": 0.209291011095047, "rewards/accuracy_reward/mean": 0.3727678656578064, "rewards/accuracy_reward/std": 0.4840816557407379, "step": 961 }, { "clip_ratio/high_max": 0.004441621473233681, "clip_ratio/high_mean": 0.0010502937780074717, "clip_ratio/low_mean": 0.0007242553293735909, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0017745490831657662, "epoch": 0.16703382565193328, "grad_norm": 0.10251028090715408, "kl": 0.151611328125, "learning_rate": 1e-06, "loss": 0.0335, "step": 962 }, { "clip_ratio/high_max": 0.005222318339292542, "clip_ratio/high_mean": 0.0012088555456557515, "clip_ratio/low_mean": 0.0009155230770829803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021243785527076398, "epoch": 0.1672074574873303, "grad_norm": 0.10151498764753342, "kl": 0.14337158203125, "learning_rate": 1e-06, "loss": 0.0333, "step": 963 }, { "clip_ratio/high_max": 0.005752219112764578, "clip_ratio/high_mean": 0.0013443723751151992, "clip_ratio/low_mean": 0.0010966981017190847, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024410705200352822, "epoch": 0.16738108932272733, "grad_norm": 0.09546945989131927, "kl": 0.1396636962890625, "learning_rate": 1e-06, "loss": 0.0331, "step": 964 }, { "clip_ratio/high_max": 0.006459886664742953, "clip_ratio/high_mean": 0.0014497532010864234, "clip_ratio/low_mean": 0.0013606243392132455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002810377524838259, "epoch": 0.16755472115812434, "grad_norm": 0.08871138840913773, "kl": 0.1439971923828125, "learning_rate": 1e-06, "loss": 0.0329, "step": 965 }, { "clip_ratio/high_max": 0.007616848784891772, "clip_ratio/high_mean": 0.0016943664481914311, "clip_ratio/low_mean": 0.001693544703812222, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0033879111133501283, "epoch": 0.16772835299352137, "grad_norm": 0.0800594687461853, "kl": 0.1440277099609375, "learning_rate": 1e-06, "loss": 0.0327, "step": 966 }, { "clip_ratio/high_max": 0.00861555489973398, "clip_ratio/high_mean": 0.0018403172016405733, "clip_ratio/low_mean": 0.0019116056605525955, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0037519228744713473, "epoch": 0.16790198482891838, "grad_norm": 0.07679668813943863, "kl": 0.13885498046875, "learning_rate": 1e-06, "loss": 0.0325, "step": 967 }, { "clip_ratio/high_max": 0.010169656130528892, "clip_ratio/high_mean": 0.002172438665638765, "clip_ratio/low_mean": 0.00214742449543337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004319863143791736, "epoch": 0.1680756166643154, "grad_norm": 0.07163725048303604, "kl": 0.1384429931640625, "learning_rate": 1e-06, "loss": 0.0323, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2589285714285714, "completions/max_length": 3072.0, "completions/max_terminated_length": 2991.0, "completions/mean_length": 1575.7232666015625, "completions/mean_terminated_length": 1052.9276123046875, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "epoch": 0.16824924849971243, "grad_norm": 0.11902888864278793, "kl": 0.1672515869140625, "learning_rate": 1e-06, "loss": 0.0383, "num_tokens": 101115477.0, "reward": 0.3058035969734192, "reward_std": 0.2532518804073334, "rewards/accuracy_reward/mean": 0.3058035671710968, "rewards/accuracy_reward/std": 0.4612620174884796, "step": 969 }, { "clip_ratio/high_max": 0.004215360608213814, "clip_ratio/high_mean": 0.0011138634922644997, "clip_ratio/low_mean": 0.0007115052903827745, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018253687858305057, "epoch": 0.16842288033510944, "grad_norm": 0.11096233874559402, "kl": 0.1635894775390625, "learning_rate": 1e-06, "loss": 0.0384, "step": 970 }, { "clip_ratio/high_max": 0.005229889815382194, "clip_ratio/high_mean": 0.0013909062718084897, "clip_ratio/low_mean": 0.0008030884952177075, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021939947737337206, "epoch": 0.16859651217050647, "grad_norm": 0.10653910040855408, "kl": 0.1583709716796875, "learning_rate": 1e-06, "loss": 0.0382, "step": 971 }, { "clip_ratio/high_max": 0.005909475403313991, "clip_ratio/high_mean": 0.0015959556294546928, "clip_ratio/low_mean": 0.0009495901540503837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025455457744101295, "epoch": 0.16877014400590348, "grad_norm": 0.09821835905313492, "kl": 0.1543426513671875, "learning_rate": 1e-06, "loss": 0.0379, "step": 972 }, { "clip_ratio/high_max": 0.006430046465538908, "clip_ratio/high_mean": 0.0017393864982295781, "clip_ratio/low_mean": 0.001276693526051531, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003016079994267784, "epoch": 0.1689437758413005, "grad_norm": 0.09009062498807907, "kl": 0.1585235595703125, "learning_rate": 1e-06, "loss": 0.0377, "step": 973 }, { "clip_ratio/high_max": 0.007472614564903779, "clip_ratio/high_mean": 0.00203917193266534, "clip_ratio/low_mean": 0.0014998206279415172, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003538992583344225, "epoch": 0.16911740767669753, "grad_norm": 0.08960425108671188, "kl": 0.152679443359375, "learning_rate": 1e-06, "loss": 0.0374, "step": 974 }, { "clip_ratio/high_max": 0.008781807828199817, "clip_ratio/high_mean": 0.00235638231424673, "clip_ratio/low_mean": 0.0019509818666847423, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0043073642045783345, "epoch": 0.16929103951209454, "grad_norm": 0.08150483667850494, "kl": 0.1522216796875, "learning_rate": 1e-06, "loss": 0.0372, "step": 975 }, { "clip_ratio/high_max": 0.009548240108415484, "clip_ratio/high_mean": 0.0025925098052539397, "clip_ratio/low_mean": 0.002303680541444919, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004896190377621679, "epoch": 0.16946467134749157, "grad_norm": 0.07574442774057388, "kl": 0.1498870849609375, "learning_rate": 1e-06, "loss": 0.037, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2522321428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 1438.1585693359375, "completions/mean_terminated_length": 887.0418090820312, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.16963830318288858, "grad_norm": 0.47625449299812317, "kl": 0.41717529296875, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 101821852.0, "reward": 0.3683035969734192, "reward_std": 0.2217673659324646, "rewards/accuracy_reward/mean": 0.3683035671710968, "rewards/accuracy_reward/std": 0.4828835427761078, "step": 977 }, { "clip_ratio/high_max": 0.005579592965659685, "clip_ratio/high_mean": 0.0013471047855091456, "clip_ratio/low_mean": 0.0008018513453862397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021489561131602386, "epoch": 0.1698119350182856, "grad_norm": 0.1158551275730133, "kl": 0.1793975830078125, "learning_rate": 1e-06, "loss": 0.048, "step": 978 }, { "clip_ratio/high_max": 0.0059739747157436796, "clip_ratio/high_mean": 0.00144659539046188, "clip_ratio/low_mean": 0.001046387033056817, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002492982424882939, "epoch": 0.16998556685368263, "grad_norm": 0.11413933336734772, "kl": 0.1797332763671875, "learning_rate": 1e-06, "loss": 0.0478, "step": 979 }, { "clip_ratio/high_max": 0.007425853998938692, "clip_ratio/high_mean": 0.0017334058645701589, "clip_ratio/low_mean": 0.0013304866242833668, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003063892505451804, "epoch": 0.17015919868907964, "grad_norm": 0.11402644962072372, "kl": 0.1752777099609375, "learning_rate": 1e-06, "loss": 0.0475, "step": 980 }, { "clip_ratio/high_max": 0.008645722507935716, "clip_ratio/high_mean": 0.001988822595194506, "clip_ratio/low_mean": 0.001576217945512326, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003565040527973906, "epoch": 0.17033283052447667, "grad_norm": 0.11177162826061249, "kl": 0.1636199951171875, "learning_rate": 1e-06, "loss": 0.0473, "step": 981 }, { "clip_ratio/high_max": 0.01070775734660856, "clip_ratio/high_mean": 0.0024708635962724657, "clip_ratio/low_mean": 0.001877494748441677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00434835839405423, "epoch": 0.17050646235987368, "grad_norm": 0.10119596123695374, "kl": 0.1494903564453125, "learning_rate": 1e-06, "loss": 0.047, "step": 982 }, { "clip_ratio/high_max": 0.01218362871077261, "clip_ratio/high_mean": 0.0027881871187673823, "clip_ratio/low_mean": 0.00213623081890546, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004924417868096498, "epoch": 0.1706800941952707, "grad_norm": 0.09608577936887741, "kl": 0.1410675048828125, "learning_rate": 1e-06, "loss": 0.0468, "step": 983 }, { "clip_ratio/high_max": 0.013543869976274436, "clip_ratio/high_mean": 0.0031086665590009943, "clip_ratio/low_mean": 0.002462121772168757, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005570788283876027, "epoch": 0.17085372603066773, "grad_norm": 0.09014745056629181, "kl": 0.1391448974609375, "learning_rate": 1e-06, "loss": 0.0465, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2991071428571429, "completions/max_length": 3072.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1570.716552734375, "completions/mean_terminated_length": 930.0414428710938, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.17102735786606474, "grad_norm": 0.1305747926235199, "kl": 0.1369171142578125, "learning_rate": 1e-06, "loss": 0.0285, "num_tokens": 102588325.0, "reward": 0.3750000298023224, "reward_std": 0.2228986620903015, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.48466411232948303, "step": 985 }, { "clip_ratio/high_max": 0.004864733062277082, "clip_ratio/high_mean": 0.001148228885995195, "clip_ratio/low_mean": 0.0008689296046213713, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020171585247226176, "epoch": 0.17120098970146178, "grad_norm": 0.11617166548967361, "kl": 0.1342620849609375, "learning_rate": 1e-06, "loss": 0.0285, "step": 986 }, { "clip_ratio/high_max": 0.005877829476958141, "clip_ratio/high_mean": 0.0013497024528987822, "clip_ratio/low_mean": 0.00096016524935294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0023098677138477797, "epoch": 0.17137462153685878, "grad_norm": 0.11007830500602722, "kl": 0.1356964111328125, "learning_rate": 1e-06, "loss": 0.0283, "step": 987 }, { "clip_ratio/high_max": 0.007923968805698678, "clip_ratio/high_mean": 0.0018167609960073605, "clip_ratio/low_mean": 0.0010725816371177643, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002889342626076541, "epoch": 0.1715482533722558, "grad_norm": 0.10499568283557892, "kl": 0.127105712890625, "learning_rate": 1e-06, "loss": 0.028, "step": 988 }, { "clip_ratio/high_max": 0.008725694417080376, "clip_ratio/high_mean": 0.002021379696088843, "clip_ratio/low_mean": 0.0014043664896234986, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00342574614478508, "epoch": 0.17172188520765283, "grad_norm": 0.09945791959762573, "kl": 0.126129150390625, "learning_rate": 1e-06, "loss": 0.0278, "step": 989 }, { "clip_ratio/high_max": 0.009398638918355573, "clip_ratio/high_mean": 0.002118518804309133, "clip_ratio/low_mean": 0.0018180987435698626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003936617540603038, "epoch": 0.17189551704304984, "grad_norm": 0.09258674830198288, "kl": 0.1265106201171875, "learning_rate": 1e-06, "loss": 0.0274, "step": 990 }, { "clip_ratio/high_max": 0.010709733054682147, "clip_ratio/high_mean": 0.0024548954752390273, "clip_ratio/low_mean": 0.0021232818844509893, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004578177453367971, "epoch": 0.17206914887844688, "grad_norm": 0.08520049601793289, "kl": 0.125244140625, "learning_rate": 1e-06, "loss": 0.0272, "step": 991 }, { "clip_ratio/high_max": 0.012825590136344545, "clip_ratio/high_mean": 0.002906436677221791, "clip_ratio/low_mean": 0.0023991291091078892, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005305565729941009, "epoch": 0.17224278071384388, "grad_norm": 0.08209982514381409, "kl": 0.12347412109375, "learning_rate": 1e-06, "loss": 0.0269, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 3072.0, "completions/max_terminated_length": 2965.0, "completions/mean_length": 1611.13623046875, "completions/mean_terminated_length": 1082.7386474609375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.1724164125492409, "grad_norm": 0.1432744562625885, "kl": 0.2009429931640625, "learning_rate": 1e-06, "loss": 0.0147, "num_tokens": 103376898.0, "reward": 0.3482142984867096, "reward_std": 0.25949329137802124, "rewards/accuracy_reward/mean": 0.3482142984867096, "rewards/accuracy_reward/std": 0.476936936378479, "step": 993 }, { "clip_ratio/high_max": 0.004854136335779913, "clip_ratio/high_mean": 0.0012013352697977098, "clip_ratio/low_mean": 0.0008651439479763212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020664792391471565, "epoch": 0.17259004438463793, "grad_norm": 0.1227901503443718, "kl": 0.1893310546875, "learning_rate": 1e-06, "loss": 0.0147, "step": 994 }, { "clip_ratio/high_max": 0.005372144907596521, "clip_ratio/high_mean": 0.0012955834067724936, "clip_ratio/low_mean": 0.0012107935572203132, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0025063769244297873, "epoch": 0.17276367622003494, "grad_norm": 0.11688683927059174, "kl": 0.1640472412109375, "learning_rate": 1e-06, "loss": 0.0146, "step": 995 }, { "clip_ratio/high_max": 0.006870652665384114, "clip_ratio/high_mean": 0.0015838507247281086, "clip_ratio/low_mean": 0.001497666465184011, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003081517203099793, "epoch": 0.17293730805543198, "grad_norm": 0.11225875467061996, "kl": 0.1633453369140625, "learning_rate": 1e-06, "loss": 0.0142, "step": 996 }, { "clip_ratio/high_max": 0.008497616989188828, "clip_ratio/high_mean": 0.0019248588614573237, "clip_ratio/low_mean": 0.0016637114704280975, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0035885703491658205, "epoch": 0.17311093989082899, "grad_norm": 0.1010153591632843, "kl": 0.156951904296875, "learning_rate": 1e-06, "loss": 0.0139, "step": 997 }, { "clip_ratio/high_max": 0.009253346670448082, "clip_ratio/high_mean": 0.0021307615375008027, "clip_ratio/low_mean": 0.002294961070674617, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00442572259271401, "epoch": 0.173284571726226, "grad_norm": 0.09036517888307571, "kl": 0.1616363525390625, "learning_rate": 1e-06, "loss": 0.0136, "step": 998 }, { "clip_ratio/high_max": 0.010810750158270821, "clip_ratio/high_mean": 0.002423984507458954, "clip_ratio/low_mean": 0.0027702523602783913, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005194236828174326, "epoch": 0.17345820356162303, "grad_norm": 0.08309189230203629, "kl": 0.1592559814453125, "learning_rate": 1e-06, "loss": 0.0134, "step": 999 }, { "clip_ratio/high_max": 0.011896438270923682, "clip_ratio/high_mean": 0.002669459651770012, "clip_ratio/low_mean": 0.003338249123771675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006007708787365118, "epoch": 0.17363183539702004, "grad_norm": 0.08024711906909943, "kl": 0.1649932861328125, "learning_rate": 1e-06, "loss": 0.0131, "step": 1000 }, { "epoch": 0.17363183539702004, "step": 1000, "total_flos": 0.0, "train_loss": 0.037211290190927684, "train_runtime": 19322.5677, "train_samples_per_second": 23.185, "train_steps_per_second": 0.052 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 103376898, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }