{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 882, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 838.1328125, "epoch": 0.0011337868480725624, "grad_norm": 0.012316840700805187, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 138961.0, "rewards/KL_reward/mean": 0.0, "rewards/KL_reward/std": 0.0, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.03760823979973793, "rewards/angle_reward/std": 0.7318449020385742, "rewards/thinking_verbosity_reward/mean": -1.42296302318573, "rewards/thinking_verbosity_reward/std": 0.26177090406417847, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 840.8203125, "epoch": 0.0022675736961451248, "grad_norm": 0.014643240720033646, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 278170.0, "rewards/KL_reward/mean": -0.00014835168258287013, "rewards/KL_reward/std": 0.0017017334466800094, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.07582557946443558, "rewards/angle_reward/std": 0.7120820879936218, "rewards/thinking_verbosity_reward/mean": -1.429264783859253, "rewards/thinking_verbosity_reward/std": 0.23895855247974396, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1029.5078125, "epoch": 0.003401360544217687, "grad_norm": 0.013309244997799397, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 442091.0, "rewards/KL_reward/mean": -0.00011519622057676315, "rewards/KL_reward/std": 0.001265935366973281, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.04690488427877426, "rewards/angle_reward/std": 0.708452582359314, "rewards/thinking_verbosity_reward/mean": -1.560727834701538, "rewards/thinking_verbosity_reward/std": 0.36943718791007996, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 827.4375, "epoch": 0.0045351473922902496, "grad_norm": 0.016940688714385033, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 580019.0, "rewards/KL_reward/mean": -2.250075340270996e-06, "rewards/KL_reward/std": 0.0013977407943457365, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.018481537699699402, "rewards/angle_reward/std": 0.7089322805404663, "rewards/thinking_verbosity_reward/mean": -1.4036836624145508, "rewards/thinking_verbosity_reward/std": 0.31064456701278687, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 898.796875, "epoch": 0.005668934240362812, "grad_norm": 0.012756886892020702, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 726905.0, "rewards/KL_reward/mean": -0.00013490879791788757, "rewards/KL_reward/std": 0.0015800945693627, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.014750942587852478, "rewards/angle_reward/std": 0.7069280743598938, "rewards/thinking_verbosity_reward/mean": -1.478535771369934, "rewards/thinking_verbosity_reward/std": 0.2427017092704773, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 992.28125, "epoch": 0.006802721088435374, "grad_norm": 0.014865963719785213, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 885477.0, "rewards/KL_reward/mean": -0.00015435644309036434, "rewards/KL_reward/std": 0.0012532330583781004, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.09406879544258118, "rewards/angle_reward/std": 0.7392042875289917, "rewards/thinking_verbosity_reward/mean": -1.5456072092056274, "rewards/thinking_verbosity_reward/std": 0.2998106777667999, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 911.109375, "epoch": 0.007936507936507936, "grad_norm": 0.014299380593001842, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 1034707.0, "rewards/KL_reward/mean": -8.136368705891073e-05, "rewards/KL_reward/std": 0.0018703237874433398, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/angle_reward/mean": -0.015537131577730179, "rewards/angle_reward/std": 0.6837882995605469, "rewards/thinking_verbosity_reward/mean": -1.476283311843872, "rewards/thinking_verbosity_reward/std": 0.31067222356796265, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 864.1796875, "epoch": 0.009070294784580499, "grad_norm": 0.014539482071995735, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 1177298.0, "rewards/KL_reward/mean": -0.0003884948091581464, "rewards/KL_reward/std": 0.001967003336176276, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.06809777766466141, "rewards/angle_reward/std": 0.6925191283226013, "rewards/thinking_verbosity_reward/mean": -1.4459242820739746, "rewards/thinking_verbosity_reward/std": 0.26023587584495544, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 850.6171875, "epoch": 0.01020408163265306, "grad_norm": 0.015095122158527374, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 1318273.0, "rewards/KL_reward/mean": -0.00043918390292674303, "rewards/KL_reward/std": 0.0015299812657758594, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.01057947427034378, "rewards/angle_reward/std": 0.7031927108764648, "rewards/thinking_verbosity_reward/mean": -1.4381227493286133, "rewards/thinking_verbosity_reward/std": 0.23724676668643951, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 838.921875, "epoch": 0.011337868480725623, "grad_norm": 0.016048606485128403, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 1457487.0, "rewards/KL_reward/mean": -0.0006849928759038448, "rewards/KL_reward/std": 0.0018519391305744648, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.014378756284713745, "rewards/angle_reward/std": 0.7116018533706665, "rewards/thinking_verbosity_reward/mean": -1.4114718437194824, "rewards/thinking_verbosity_reward/std": 0.3214956521987915, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 829.4453125, "epoch": 0.012471655328798186, "grad_norm": 0.015514638274908066, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 1596120.0, "rewards/KL_reward/mean": -0.0008332775323651731, "rewards/KL_reward/std": 0.0021017943508923054, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.020728295668959618, "rewards/angle_reward/std": 0.6793127655982971, "rewards/thinking_verbosity_reward/mean": -1.4123082160949707, "rewards/thinking_verbosity_reward/std": 0.2772451639175415, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 751.203125, "epoch": 0.013605442176870748, "grad_norm": 0.015006035566329956, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 1724122.0, "rewards/KL_reward/mean": -0.0006982996128499508, "rewards/KL_reward/std": 0.0018833853537216783, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.042627930641174316, "rewards/angle_reward/std": 0.7245418429374695, "rewards/thinking_verbosity_reward/mean": -1.350346565246582, "rewards/thinking_verbosity_reward/std": 0.22878825664520264, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 844.7578125, "epoch": 0.01473922902494331, "grad_norm": 0.014846655540168285, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 1864179.0, "rewards/KL_reward/mean": -0.0004108635475859046, "rewards/KL_reward/std": 0.00215906766243279, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.03426017612218857, "rewards/angle_reward/std": 0.7090725302696228, "rewards/thinking_verbosity_reward/mean": -1.435034990310669, "rewards/thinking_verbosity_reward/std": 0.22442129254341125, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 829.796875, "epoch": 0.015873015873015872, "grad_norm": 0.019330566748976707, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 2002601.0, "rewards/KL_reward/mean": -0.0008955916855484247, "rewards/KL_reward/std": 0.002487297635525465, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": 0.008562322705984116, "rewards/angle_reward/std": 0.699159562587738, "rewards/thinking_verbosity_reward/mean": -1.4030687808990479, "rewards/thinking_verbosity_reward/std": 0.32274603843688965, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 879.6875, "epoch": 0.017006802721088437, "grad_norm": 0.01438729465007782, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 2147689.0, "rewards/KL_reward/mean": -0.000829770986456424, "rewards/KL_reward/std": 0.002734147710725665, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.05539670214056969, "rewards/angle_reward/std": 0.7079266905784607, "rewards/thinking_verbosity_reward/mean": -1.4588990211486816, "rewards/thinking_verbosity_reward/std": 0.2622072398662567, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 874.1484375, "epoch": 0.018140589569160998, "grad_norm": 0.012773919850587845, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 2292084.0, "rewards/KL_reward/mean": -0.0006347743328660727, "rewards/KL_reward/std": 0.0020726905204355717, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.02527463063597679, "rewards/angle_reward/std": 0.7010154724121094, "rewards/thinking_verbosity_reward/mean": -1.4546244144439697, "rewards/thinking_verbosity_reward/std": 0.2597421705722809, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 857.4375, "epoch": 0.01927437641723356, "grad_norm": 0.015292557887732983, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 2433140.0, "rewards/KL_reward/mean": -0.00018716679187491536, "rewards/KL_reward/std": 0.0016854844288900495, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.08626461029052734, "rewards/angle_reward/std": 0.7406377196311951, "rewards/thinking_verbosity_reward/mean": -1.4333903789520264, "rewards/thinking_verbosity_reward/std": 0.29524025321006775, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 810.0078125, "epoch": 0.02040816326530612, "grad_norm": 0.013641524128615856, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 2568493.0, "rewards/KL_reward/mean": -0.0009980101604014635, "rewards/KL_reward/std": 0.0025766361504793167, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.04107923433184624, "rewards/angle_reward/std": 0.7071124315261841, "rewards/thinking_verbosity_reward/mean": -1.3919694423675537, "rewards/thinking_verbosity_reward/std": 0.2925904393196106, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 833.6484375, "epoch": 0.021541950113378686, "grad_norm": 0.01295141689479351, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 2707016.0, "rewards/KL_reward/mean": -0.0005243468331173062, "rewards/KL_reward/std": 0.001648963545449078, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4513758420944214, "rewards/angle_reward/mean": 0.04255552589893341, "rewards/angle_reward/std": 0.6983751654624939, "rewards/thinking_verbosity_reward/mean": -1.4239087104797363, "rewards/thinking_verbosity_reward/std": 0.2333919256925583, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 698.40625, "epoch": 0.022675736961451247, "grad_norm": 0.01779448799788952, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 2828884.0, "rewards/KL_reward/mean": -0.0006695782649330795, "rewards/KL_reward/std": 0.003075928892940283, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.06875322014093399, "rewards/angle_reward/std": 0.702839732170105, "rewards/thinking_verbosity_reward/mean": -1.2984439134597778, "rewards/thinking_verbosity_reward/std": 0.24081647396087646, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 830.3984375, "epoch": 0.023809523809523808, "grad_norm": 0.019104426726698875, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 2967359.0, "rewards/KL_reward/mean": -0.00040592922596260905, "rewards/KL_reward/std": 0.0025070447009056807, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.03683070093393326, "rewards/angle_reward/std": 0.7420152425765991, "rewards/thinking_verbosity_reward/mean": -1.4153430461883545, "rewards/thinking_verbosity_reward/std": 0.26599904894828796, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 788.46875, "epoch": 0.024943310657596373, "grad_norm": 0.014593786559998989, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 3100163.0, "rewards/KL_reward/mean": -0.0007708030752837658, "rewards/KL_reward/std": 0.001973372884094715, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.08793884515762329, "rewards/angle_reward/std": 0.728395402431488, "rewards/thinking_verbosity_reward/mean": -1.3775405883789062, "rewards/thinking_verbosity_reward/std": 0.26757028698921204, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 895.7421875, "epoch": 0.026077097505668934, "grad_norm": 0.01430218294262886, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 3246506.0, "rewards/KL_reward/mean": -0.0006098577287048101, "rewards/KL_reward/std": 0.001767677254974842, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/angle_reward/mean": 0.06294762343168259, "rewards/angle_reward/std": 0.6941813230514526, "rewards/thinking_verbosity_reward/mean": -1.4643688201904297, "rewards/thinking_verbosity_reward/std": 0.30500882863998413, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 891.09375, "epoch": 0.027210884353741496, "grad_norm": 0.01346637960523367, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 3392310.0, "rewards/KL_reward/mean": -0.0006486388156190515, "rewards/KL_reward/std": 0.0017589215422049165, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.020684920251369476, "rewards/angle_reward/std": 0.7452264428138733, "rewards/thinking_verbosity_reward/mean": -1.4671560525894165, "rewards/thinking_verbosity_reward/std": 0.2706656754016876, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 783.2578125, "epoch": 0.02834467120181406, "grad_norm": 0.015406543388962746, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 3524327.0, "rewards/KL_reward/mean": -0.0008891090401448309, "rewards/KL_reward/std": 0.002750263549387455, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.07052788138389587, "rewards/angle_reward/std": 0.6934146285057068, "rewards/thinking_verbosity_reward/mean": -1.3774542808532715, "rewards/thinking_verbosity_reward/std": 0.24232842028141022, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 907.2578125, "epoch": 0.02947845804988662, "grad_norm": 0.0155716547742486, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 3671576.0, "rewards/KL_reward/mean": -0.0009992108680307865, "rewards/KL_reward/std": 0.001991155557334423, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.0619647316634655, "rewards/angle_reward/std": 0.713699221611023, "rewards/thinking_verbosity_reward/mean": -1.463057518005371, "rewards/thinking_verbosity_reward/std": 0.355150431394577, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 807.40625, "epoch": 0.030612244897959183, "grad_norm": 0.013339817523956299, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 3806604.0, "rewards/KL_reward/mean": -0.0007779946317896247, "rewards/KL_reward/std": 0.0016896483721211553, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.05560939013957977, "rewards/angle_reward/std": 0.6921024322509766, "rewards/thinking_verbosity_reward/mean": -1.390915870666504, "rewards/thinking_verbosity_reward/std": 0.28644102811813354, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 797.8984375, "epoch": 0.031746031746031744, "grad_norm": 0.017945559695363045, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 3941207.0, "rewards/KL_reward/mean": -0.0016435969155281782, "rewards/KL_reward/std": 0.003532285103574395, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.023568615317344666, "rewards/angle_reward/std": 0.7035307288169861, "rewards/thinking_verbosity_reward/mean": -1.3779566287994385, "rewards/thinking_verbosity_reward/std": 0.3068205416202545, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 770.296875, "epoch": 0.032879818594104306, "grad_norm": 0.016661623492836952, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 4072077.0, "rewards/KL_reward/mean": -0.0008901930414140224, "rewards/KL_reward/std": 0.0017296381993219256, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/angle_reward/mean": 0.03749765455722809, "rewards/angle_reward/std": 0.7588507533073425, "rewards/thinking_verbosity_reward/mean": -1.3564424514770508, "rewards/thinking_verbosity_reward/std": 0.28972840309143066, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 737.8671875, "epoch": 0.034013605442176874, "grad_norm": 0.017227329313755035, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 4198900.0, "rewards/KL_reward/mean": -0.0016070909332484007, "rewards/KL_reward/std": 0.0026566628366708755, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.08105626702308655, "rewards/angle_reward/std": 0.7346787452697754, "rewards/thinking_verbosity_reward/mean": -1.3244327306747437, "rewards/thinking_verbosity_reward/std": 0.29772624373435974, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 743.7578125, "epoch": 0.035147392290249435, "grad_norm": 0.015487901866436005, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 4325405.0, "rewards/KL_reward/mean": -0.001097002881579101, "rewards/KL_reward/std": 0.0027626247610896826, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.017529528588056564, "rewards/angle_reward/std": 0.737390398979187, "rewards/thinking_verbosity_reward/mean": -1.336993932723999, "rewards/thinking_verbosity_reward/std": 0.2641395628452301, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 817.65625, "epoch": 0.036281179138321996, "grad_norm": 0.014715392142534256, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 4462409.0, "rewards/KL_reward/mean": -0.0015238930936902761, "rewards/KL_reward/std": 0.0031906100921332836, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.08617420494556427, "rewards/angle_reward/std": 0.703201174736023, "rewards/thinking_verbosity_reward/mean": -1.3977710008621216, "rewards/thinking_verbosity_reward/std": 0.2975722551345825, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 816.140625, "epoch": 0.03741496598639456, "grad_norm": 0.014042104594409466, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 4599411.0, "rewards/KL_reward/mean": -0.001561877434141934, "rewards/KL_reward/std": 0.0028319573029875755, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.0055811963975429535, "rewards/angle_reward/std": 0.7309610843658447, "rewards/thinking_verbosity_reward/mean": -1.3999909162521362, "rewards/thinking_verbosity_reward/std": 0.2801964282989502, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 885.375, "epoch": 0.03854875283446712, "grad_norm": 0.013550758361816406, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 4744619.0, "rewards/KL_reward/mean": -0.0006970397080294788, "rewards/KL_reward/std": 0.002382143633440137, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.08778101950883865, "rewards/angle_reward/std": 0.7114675641059875, "rewards/thinking_verbosity_reward/mean": -1.4606428146362305, "rewards/thinking_verbosity_reward/std": 0.27926936745643616, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 827.9921875, "epoch": 0.03968253968253968, "grad_norm": 0.013626561500132084, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 4882898.0, "rewards/KL_reward/mean": -0.0010192693443968892, "rewards/KL_reward/std": 0.0023207853082567453, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.06453114748001099, "rewards/angle_reward/std": 0.6974670886993408, "rewards/thinking_verbosity_reward/mean": -1.4110902547836304, "rewards/thinking_verbosity_reward/std": 0.27717676758766174, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 615.5234375, "epoch": 0.04081632653061224, "grad_norm": 0.016563985496759415, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 4992893.0, "rewards/KL_reward/mean": -0.001065884716808796, "rewards/KL_reward/std": 0.0026898810174316168, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/angle_reward/mean": 0.03843190148472786, "rewards/angle_reward/std": 0.7020615935325623, "rewards/thinking_verbosity_reward/mean": -1.2185498476028442, "rewards/thinking_verbosity_reward/std": 0.22774890065193176, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 735.5078125, "epoch": 0.04195011337868481, "grad_norm": 0.01565818302333355, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 5119166.0, "rewards/KL_reward/mean": -0.0016780947335064411, "rewards/KL_reward/std": 0.0030078997369855642, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43914902210235596, "rewards/angle_reward/mean": -0.000943564809858799, "rewards/angle_reward/std": 0.7384704947471619, "rewards/thinking_verbosity_reward/mean": -1.3351643085479736, "rewards/thinking_verbosity_reward/std": 0.2324395775794983, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 759.9375, "epoch": 0.04308390022675737, "grad_norm": 0.01692848466336727, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 5248158.0, "rewards/KL_reward/mean": -0.0013938448391854763, "rewards/KL_reward/std": 0.0023707833606749773, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.02276681549847126, "rewards/angle_reward/std": 0.735538899898529, "rewards/thinking_verbosity_reward/mean": -1.3573532104492188, "rewards/thinking_verbosity_reward/std": 0.2352214753627777, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 854.5234375, "epoch": 0.04421768707482993, "grad_norm": 0.01324634999036789, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 5389617.0, "rewards/KL_reward/mean": -0.0013230672338977456, "rewards/KL_reward/std": 0.002388726221397519, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.0635545626282692, "rewards/angle_reward/std": 0.7063141465187073, "rewards/thinking_verbosity_reward/mean": -1.4416253566741943, "rewards/thinking_verbosity_reward/std": 0.2365209311246872, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 804.671875, "epoch": 0.045351473922902494, "grad_norm": 0.015133047476410866, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 5524679.0, "rewards/KL_reward/mean": -0.001572504872456193, "rewards/KL_reward/std": 0.002796533051878214, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.05160997435450554, "rewards/angle_reward/std": 0.7147625684738159, "rewards/thinking_verbosity_reward/mean": -1.3938100337982178, "rewards/thinking_verbosity_reward/std": 0.2584945261478424, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 747.8046875, "epoch": 0.046485260770975055, "grad_norm": 0.016320038586854935, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 5652870.0, "rewards/KL_reward/mean": -0.0015140497125685215, "rewards/KL_reward/std": 0.0029329690150916576, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.02412802167236805, "rewards/angle_reward/std": 0.7106620073318481, "rewards/thinking_verbosity_reward/mean": -1.340445876121521, "rewards/thinking_verbosity_reward/std": 0.2660253047943115, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 841.0859375, "epoch": 0.047619047619047616, "grad_norm": 0.012567983008921146, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 5792441.0, "rewards/KL_reward/mean": -0.001314467517659068, "rewards/KL_reward/std": 0.0024355482310056686, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.0059778811410069466, "rewards/angle_reward/std": 0.7117019891738892, "rewards/thinking_verbosity_reward/mean": -1.4328210353851318, "rewards/thinking_verbosity_reward/std": 0.2181081622838974, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 853.53125, "epoch": 0.048752834467120185, "grad_norm": 0.014290316961705685, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 5934229.0, "rewards/KL_reward/mean": -0.0013754046522080898, "rewards/KL_reward/std": 0.002865551272407174, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": -0.04012656584382057, "rewards/angle_reward/std": 0.7322371602058411, "rewards/thinking_verbosity_reward/mean": -1.4434272050857544, "rewards/thinking_verbosity_reward/std": 0.2195524126291275, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 812.3984375, "epoch": 0.049886621315192746, "grad_norm": 0.015406670048832893, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 6070624.0, "rewards/KL_reward/mean": -0.0012093674158677459, "rewards/KL_reward/std": 0.002830892102792859, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": 0.06462673842906952, "rewards/angle_reward/std": 0.7060084939002991, "rewards/thinking_verbosity_reward/mean": -1.4061496257781982, "rewards/thinking_verbosity_reward/std": 0.2272499054670334, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 769.0234375, "epoch": 0.05102040816326531, "grad_norm": 0.014261603355407715, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 6200923.0, "rewards/KL_reward/mean": -0.0015996790025383234, "rewards/KL_reward/std": 0.003018921473994851, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": -0.07584583759307861, "rewards/angle_reward/std": 0.7126216292381287, "rewards/thinking_verbosity_reward/mean": -1.3641209602355957, "rewards/thinking_verbosity_reward/std": 0.2442130744457245, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 805.4453125, "epoch": 0.05215419501133787, "grad_norm": 0.015710052102804184, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 6336236.0, "rewards/KL_reward/mean": -0.0011746239615604281, "rewards/KL_reward/std": 0.0022468946408480406, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.0019333362579345703, "rewards/angle_reward/std": 0.7123419046401978, "rewards/thinking_verbosity_reward/mean": -1.3931634426116943, "rewards/thinking_verbosity_reward/std": 0.2658627927303314, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 747.8203125, "epoch": 0.05328798185941043, "grad_norm": 0.014637403190135956, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 6464125.0, "rewards/KL_reward/mean": -0.0015386963495984674, "rewards/KL_reward/std": 0.002992629073560238, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": 0.03532971814274788, "rewards/angle_reward/std": 0.7095845341682434, "rewards/thinking_verbosity_reward/mean": -1.3381673097610474, "rewards/thinking_verbosity_reward/std": 0.2773464620113373, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 749.6015625, "epoch": 0.05442176870748299, "grad_norm": 0.015380950644612312, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 6591602.0, "rewards/KL_reward/mean": -0.001205449691042304, "rewards/KL_reward/std": 0.002618036000058055, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.0005743652582168579, "rewards/angle_reward/std": 0.7198529243469238, "rewards/thinking_verbosity_reward/mean": -1.337047815322876, "rewards/thinking_verbosity_reward/std": 0.2906661331653595, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 831.671875, "epoch": 0.05555555555555555, "grad_norm": 0.013588406145572662, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 6730392.0, "rewards/KL_reward/mean": -0.0013649301836267114, "rewards/KL_reward/std": 0.0024287677370011806, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.1746762990951538, "rewards/angle_reward/mean": 0.024919526651501656, "rewards/angle_reward/std": 0.7198489308357239, "rewards/thinking_verbosity_reward/mean": -1.419439435005188, "rewards/thinking_verbosity_reward/std": 0.24967728555202484, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 867.8671875, "epoch": 0.05668934240362812, "grad_norm": 0.014480918645858765, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 6873375.0, "rewards/KL_reward/mean": -0.0010345801711082458, "rewards/KL_reward/std": 0.0020714737474918365, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": 0.09728822112083435, "rewards/angle_reward/std": 0.7140886187553406, "rewards/thinking_verbosity_reward/mean": -1.4425103664398193, "rewards/thinking_verbosity_reward/std": 0.2950771152973175, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 763.890625, "epoch": 0.05782312925170068, "grad_norm": 0.01474759727716446, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 7003217.0, "rewards/KL_reward/mean": -0.0014291137922555208, "rewards/KL_reward/std": 0.002928144298493862, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.062387678772211075, "rewards/angle_reward/std": 0.7102147936820984, "rewards/thinking_verbosity_reward/mean": -1.3571451902389526, "rewards/thinking_verbosity_reward/std": 0.25666871666908264, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 848.828125, "epoch": 0.05895691609977324, "grad_norm": 0.014926274307072163, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 7144267.0, "rewards/KL_reward/mean": -0.001141536864452064, "rewards/KL_reward/std": 0.002282851841300726, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.015168175101280212, "rewards/angle_reward/std": 0.7204251885414124, "rewards/thinking_verbosity_reward/mean": -1.4233806133270264, "rewards/thinking_verbosity_reward/std": 0.30707427859306335, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 696.6875, "epoch": 0.060090702947845805, "grad_norm": 0.016644051298499107, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 7265363.0, "rewards/KL_reward/mean": -0.0015665598912164569, "rewards/KL_reward/std": 0.0025091448333114386, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.06638424098491669, "rewards/angle_reward/std": 0.7038470506668091, "rewards/thinking_verbosity_reward/mean": -1.2934876680374146, "rewards/thinking_verbosity_reward/std": 0.25808796286582947, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 803.3046875, "epoch": 0.061224489795918366, "grad_norm": 0.014262320473790169, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 7400074.0, "rewards/KL_reward/mean": -0.0014873250620439649, "rewards/KL_reward/std": 0.003350914688780904, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.0006549600511789322, "rewards/angle_reward/std": 0.7145742774009705, "rewards/thinking_verbosity_reward/mean": -1.4003604650497437, "rewards/thinking_verbosity_reward/std": 0.21219314634799957, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 712.109375, "epoch": 0.06235827664399093, "grad_norm": 0.015680724754929543, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 7522512.0, "rewards/KL_reward/mean": -0.0015632398426532745, "rewards/KL_reward/std": 0.002867954084649682, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.057161569595336914, "rewards/angle_reward/std": 0.7097866535186768, "rewards/thinking_verbosity_reward/mean": -1.3076016902923584, "rewards/thinking_verbosity_reward/std": 0.26154884696006775, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 796.15625, "epoch": 0.06349206349206349, "grad_norm": 0.01348777674138546, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 7656388.0, "rewards/KL_reward/mean": -0.0012216382892802358, "rewards/KL_reward/std": 0.002764075295999646, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.01048531848937273, "rewards/angle_reward/std": 0.7025184631347656, "rewards/thinking_verbosity_reward/mean": -1.3820040225982666, "rewards/thinking_verbosity_reward/std": 0.28005632758140564, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 791.1328125, "epoch": 0.06462585034013606, "grad_norm": 0.015808461233973503, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 7789629.0, "rewards/KL_reward/mean": -0.0014838757924735546, "rewards/KL_reward/std": 0.0027472227811813354, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": 0.027304016053676605, "rewards/angle_reward/std": 0.6935069561004639, "rewards/thinking_verbosity_reward/mean": -1.3721787929534912, "rewards/thinking_verbosity_reward/std": 0.30532729625701904, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 655.4453125, "epoch": 0.06575963718820861, "grad_norm": 0.01579357124865055, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 7905598.0, "rewards/KL_reward/mean": -0.0015653329901397228, "rewards/KL_reward/std": 0.0028549018315970898, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.050490882247686386, "rewards/angle_reward/std": 0.7122427225112915, "rewards/thinking_verbosity_reward/mean": -1.2625588178634644, "rewards/thinking_verbosity_reward/std": 0.20583978295326233, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 764.5, "epoch": 0.06689342403628118, "grad_norm": 0.01618257351219654, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 8035030.0, "rewards/KL_reward/mean": -0.001291181892156601, "rewards/KL_reward/std": 0.0025379913859069347, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.026311999186873436, "rewards/angle_reward/std": 0.7142142057418823, "rewards/thinking_verbosity_reward/mean": -1.3533778190612793, "rewards/thinking_verbosity_reward/std": 0.27879348397254944, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 773.453125, "epoch": 0.06802721088435375, "grad_norm": 0.013515808619558811, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 8166064.0, "rewards/KL_reward/mean": -0.0010551323648542166, "rewards/KL_reward/std": 0.002373181516304612, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.11679263412952423, "rewards/angle_reward/std": 0.7029911279678345, "rewards/thinking_verbosity_reward/mean": -1.3738677501678467, "rewards/thinking_verbosity_reward/std": 0.2095806747674942, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 909.96875, "epoch": 0.0691609977324263, "grad_norm": 0.013791844248771667, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 8314388.0, "rewards/KL_reward/mean": -0.0012126723304390907, "rewards/KL_reward/std": 0.002127976855263114, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.008181052282452583, "rewards/angle_reward/std": 0.7238855957984924, "rewards/thinking_verbosity_reward/mean": -1.4832502603530884, "rewards/thinking_verbosity_reward/std": 0.2702941596508026, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 604.140625, "epoch": 0.07029478458049887, "grad_norm": 0.020265107974410057, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 8423910.0, "rewards/KL_reward/mean": -0.0018225734820589423, "rewards/KL_reward/std": 0.0034152804873883724, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.039339445531368256, "rewards/angle_reward/std": 0.713613748550415, "rewards/thinking_verbosity_reward/mean": -1.2084236145019531, "rewards/thinking_verbosity_reward/std": 0.218318372964859, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 781.0625, "epoch": 0.07142857142857142, "grad_norm": 0.014227538369596004, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 8556422.0, "rewards/KL_reward/mean": -0.0014920226531103253, "rewards/KL_reward/std": 0.0025782333686947823, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.04792693629860878, "rewards/angle_reward/std": 0.7073825001716614, "rewards/thinking_verbosity_reward/mean": -1.37550687789917, "rewards/thinking_verbosity_reward/std": 0.24198274314403534, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 828.125, "epoch": 0.07256235827664399, "grad_norm": 0.013525891117751598, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 8694350.0, "rewards/KL_reward/mean": -0.0012743088882416487, "rewards/KL_reward/std": 0.0024931752122938633, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": -0.06039245426654816, "rewards/angle_reward/std": 0.7030278444290161, "rewards/thinking_verbosity_reward/mean": -1.4096813201904297, "rewards/thinking_verbosity_reward/std": 0.2851056754589081, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 859.4609375, "epoch": 0.07369614512471655, "grad_norm": 0.013323036022484303, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 8836177.0, "rewards/KL_reward/mean": -0.0013674057554453611, "rewards/KL_reward/std": 0.0025408368092030287, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.020052675157785416, "rewards/angle_reward/std": 0.7065397500991821, "rewards/thinking_verbosity_reward/mean": -1.4406501054763794, "rewards/thinking_verbosity_reward/std": 0.2669748067855835, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 820.265625, "epoch": 0.07482993197278912, "grad_norm": 0.011960196308791637, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 8972659.0, "rewards/KL_reward/mean": -0.0016648797318339348, "rewards/KL_reward/std": 0.0024159452877938747, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.13370326161384583, "rewards/angle_reward/std": 0.6665264368057251, "rewards/thinking_verbosity_reward/mean": -1.402920126914978, "rewards/thinking_verbosity_reward/std": 0.2838478088378906, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 735.5, "epoch": 0.07596371882086168, "grad_norm": 0.01711840182542801, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 9099099.0, "rewards/KL_reward/mean": -0.0008906584698706865, "rewards/KL_reward/std": 0.0025882436893880367, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.10576502978801727, "rewards/angle_reward/std": 0.7215561866760254, "rewards/thinking_verbosity_reward/mean": -1.328075647354126, "rewards/thinking_verbosity_reward/std": 0.27021440863609314, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 745.984375, "epoch": 0.07709750566893424, "grad_norm": 0.015710189938545227, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 9226129.0, "rewards/KL_reward/mean": -0.0013148458674550056, "rewards/KL_reward/std": 0.002292931778356433, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4513758420944214, "rewards/angle_reward/mean": 0.0033968668431043625, "rewards/angle_reward/std": 0.6955055594444275, "rewards/thinking_verbosity_reward/mean": -1.3397393226623535, "rewards/thinking_verbosity_reward/std": 0.26094380021095276, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 731.171875, "epoch": 0.0782312925170068, "grad_norm": 0.013799438253045082, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 9351943.0, "rewards/KL_reward/mean": -0.0013583763502538204, "rewards/KL_reward/std": 0.0027255534660071135, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.0322229377925396, "rewards/angle_reward/std": 0.7012441158294678, "rewards/thinking_verbosity_reward/mean": -1.3318828344345093, "rewards/thinking_verbosity_reward/std": 0.2277809977531433, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 816.46875, "epoch": 0.07936507936507936, "grad_norm": 0.014650252647697926, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 9488251.0, "rewards/KL_reward/mean": -0.0020846647676080465, "rewards/KL_reward/std": 0.002860372420400381, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.04663441330194473, "rewards/angle_reward/std": 0.7286649942398071, "rewards/thinking_verbosity_reward/mean": -1.4011547565460205, "rewards/thinking_verbosity_reward/std": 0.2757023870944977, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 691.546875, "epoch": 0.08049886621315193, "grad_norm": 0.017928482964634895, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 9608449.0, "rewards/KL_reward/mean": -0.001543578808195889, "rewards/KL_reward/std": 0.0029939457308501005, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.040303785353899, "rewards/angle_reward/std": 0.7216225862503052, "rewards/thinking_verbosity_reward/mean": -1.292057752609253, "rewards/thinking_verbosity_reward/std": 0.23946501314640045, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 774.8046875, "epoch": 0.08163265306122448, "grad_norm": 0.016091670840978622, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 9739752.0, "rewards/KL_reward/mean": -0.0016156400088220835, "rewards/KL_reward/std": 0.0028179564978927374, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.046681374311447144, "rewards/angle_reward/std": 0.7233835458755493, "rewards/thinking_verbosity_reward/mean": -1.364489197731018, "rewards/thinking_verbosity_reward/std": 0.2705446183681488, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 779.796875, "epoch": 0.08276643990929705, "grad_norm": 0.01477085892111063, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 9871790.0, "rewards/KL_reward/mean": -0.001758662285283208, "rewards/KL_reward/std": 0.002572139957919717, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.06254503130912781, "rewards/angle_reward/std": 0.7012284398078918, "rewards/thinking_verbosity_reward/mean": -1.3785450458526611, "rewards/thinking_verbosity_reward/std": 0.21673206984996796, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 803.3984375, "epoch": 0.08390022675736962, "grad_norm": 0.013954821974039078, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 10006401.0, "rewards/KL_reward/mean": -0.002070910297334194, "rewards/KL_reward/std": 0.0029955198988318443, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": 0.007975263521075249, "rewards/angle_reward/std": 0.719423234462738, "rewards/thinking_verbosity_reward/mean": -1.3989970684051514, "rewards/thinking_verbosity_reward/std": 0.22138263285160065, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 865.4375, "epoch": 0.08503401360544217, "grad_norm": 0.015697991475462914, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 10149401.0, "rewards/KL_reward/mean": -0.0014827789273113012, "rewards/KL_reward/std": 0.0024286627303808928, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.031810443848371506, "rewards/angle_reward/std": 0.7595731616020203, "rewards/thinking_verbosity_reward/mean": -1.4320918321609497, "rewards/thinking_verbosity_reward/std": 0.33330804109573364, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 815.40625, "epoch": 0.08616780045351474, "grad_norm": 0.015063408762216568, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 10285349.0, "rewards/KL_reward/mean": -0.0011845446424558759, "rewards/KL_reward/std": 0.0025990239810198545, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/angle_reward/mean": 0.014048881828784943, "rewards/angle_reward/std": 0.6989704370498657, "rewards/thinking_verbosity_reward/mean": -1.4028055667877197, "rewards/thinking_verbosity_reward/std": 0.2620105445384979, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 755.0546875, "epoch": 0.0873015873015873, "grad_norm": 0.014532950706779957, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 10414204.0, "rewards/KL_reward/mean": -0.00224322103895247, "rewards/KL_reward/std": 0.00291983550414443, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.050674110651016235, "rewards/angle_reward/std": 0.7175518274307251, "rewards/thinking_verbosity_reward/mean": -1.3487523794174194, "rewards/thinking_verbosity_reward/std": 0.257921040058136, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 814.984375, "epoch": 0.08843537414965986, "grad_norm": 0.016100991517305374, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 10550802.0, "rewards/KL_reward/mean": -0.0012918481370434165, "rewards/KL_reward/std": 0.002293823752552271, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.059970177710056305, "rewards/angle_reward/std": 0.695135235786438, "rewards/thinking_verbosity_reward/mean": -1.410853385925293, "rewards/thinking_verbosity_reward/std": 0.21162235736846924, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 719.9609375, "epoch": 0.08956916099773243, "grad_norm": 0.01754230074584484, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 10674333.0, "rewards/KL_reward/mean": -0.0015553045086562634, "rewards/KL_reward/std": 0.003376440843567252, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.0025256406515836716, "rewards/angle_reward/std": 0.7213703393936157, "rewards/thinking_verbosity_reward/mean": -1.316839575767517, "rewards/thinking_verbosity_reward/std": 0.2525773346424103, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 828.6328125, "epoch": 0.09070294784580499, "grad_norm": 0.014959538355469704, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 10812118.0, "rewards/KL_reward/mean": -0.0014992081560194492, "rewards/KL_reward/std": 0.0026648149359971285, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": 0.02252374216914177, "rewards/angle_reward/std": 0.7095814347267151, "rewards/thinking_verbosity_reward/mean": -1.3917865753173828, "rewards/thinking_verbosity_reward/std": 0.36484360694885254, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 787.171875, "epoch": 0.09183673469387756, "grad_norm": 0.013848821632564068, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 10944932.0, "rewards/KL_reward/mean": -0.0012344918213784695, "rewards/KL_reward/std": 0.0031588769052177668, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43914902210235596, "rewards/angle_reward/mean": -0.036400556564331055, "rewards/angle_reward/std": 0.7025420069694519, "rewards/thinking_verbosity_reward/mean": -1.3798415660858154, "rewards/thinking_verbosity_reward/std": 0.24886053800582886, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 797.65625, "epoch": 0.09297052154195011, "grad_norm": 0.016953621059656143, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 11079456.0, "rewards/KL_reward/mean": -0.0018349254969507456, "rewards/KL_reward/std": 0.0028502477798610926, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.09390395134687424, "rewards/angle_reward/std": 0.7117813229560852, "rewards/thinking_verbosity_reward/mean": -1.3859490156173706, "rewards/thinking_verbosity_reward/std": 0.2669912278652191, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 712.2890625, "epoch": 0.09410430839002268, "grad_norm": 0.01498359628021717, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 11202453.0, "rewards/KL_reward/mean": -0.0020972711499780416, "rewards/KL_reward/std": 0.002872837008908391, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.01293040532618761, "rewards/angle_reward/std": 0.7263043522834778, "rewards/thinking_verbosity_reward/mean": -1.3143742084503174, "rewards/thinking_verbosity_reward/std": 0.22596809267997742, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 744.9765625, "epoch": 0.09523809523809523, "grad_norm": 0.014811373315751553, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 11329906.0, "rewards/KL_reward/mean": -0.0017401642398908734, "rewards/KL_reward/std": 0.0029196979012340307, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": 0.05458701029419899, "rewards/angle_reward/std": 0.7082171440124512, "rewards/thinking_verbosity_reward/mean": -1.3352049589157104, "rewards/thinking_verbosity_reward/std": 0.27892017364501953, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 806.9296875, "epoch": 0.0963718820861678, "grad_norm": 0.014128442853689194, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 11465633.0, "rewards/KL_reward/mean": -0.0013117834459990263, "rewards/KL_reward/std": 0.003301942953839898, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.008452286012470722, "rewards/angle_reward/std": 0.713223934173584, "rewards/thinking_verbosity_reward/mean": -1.3895466327667236, "rewards/thinking_verbosity_reward/std": 0.29090580344200134, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 791.3125, "epoch": 0.09750566893424037, "grad_norm": 0.016451245173811913, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 11598353.0, "rewards/KL_reward/mean": -0.002211233600974083, "rewards/KL_reward/std": 0.003300619777292013, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.019138764590024948, "rewards/angle_reward/std": 0.6985461711883545, "rewards/thinking_verbosity_reward/mean": -1.368886947631836, "rewards/thinking_verbosity_reward/std": 0.32039639353752136, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 812.6875, "epoch": 0.09863945578231292, "grad_norm": 0.015922971069812775, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 11734393.0, "rewards/KL_reward/mean": -0.0019225740106776357, "rewards/KL_reward/std": 0.003200812265276909, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.15048927068710327, "rewards/angle_reward/std": 0.701505720615387, "rewards/thinking_verbosity_reward/mean": -1.4007548093795776, "rewards/thinking_verbosity_reward/std": 0.26006340980529785, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 756.71875, "epoch": 0.09977324263038549, "grad_norm": 0.017682049423456192, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 11863573.0, "rewards/KL_reward/mean": -0.0021198848262429237, "rewards/KL_reward/std": 0.0028244415298104286, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.05446275323629379, "rewards/angle_reward/std": 0.7076202034950256, "rewards/thinking_verbosity_reward/mean": -1.344355583190918, "rewards/thinking_verbosity_reward/std": 0.2874905467033386, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 716.015625, "epoch": 0.10090702947845805, "grad_norm": 0.015480165369808674, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 11986591.0, "rewards/KL_reward/mean": -0.002201077062636614, "rewards/KL_reward/std": 0.0028800820000469685, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.10327331721782684, "rewards/angle_reward/std": 0.6940180063247681, "rewards/thinking_verbosity_reward/mean": -1.3169283866882324, "rewards/thinking_verbosity_reward/std": 0.2315988689661026, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 789.28125, "epoch": 0.10204081632653061, "grad_norm": 0.0141932163387537, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 12119795.0, "rewards/KL_reward/mean": -0.00178247201256454, "rewards/KL_reward/std": 0.002433969872072339, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.04347587376832962, "rewards/angle_reward/std": 0.7216427326202393, "rewards/thinking_verbosity_reward/mean": -1.3841197490692139, "rewards/thinking_verbosity_reward/std": 0.2351774126291275, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 691.609375, "epoch": 0.10317460317460317, "grad_norm": 0.01886655017733574, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 12240073.0, "rewards/KL_reward/mean": -0.002210551407188177, "rewards/KL_reward/std": 0.0031716807279735804, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.06363484263420105, "rewards/angle_reward/std": 0.6967713832855225, "rewards/thinking_verbosity_reward/mean": -1.2888851165771484, "rewards/thinking_verbosity_reward/std": 0.25650152564048767, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 731.5546875, "epoch": 0.10430839002267574, "grad_norm": 0.01625211536884308, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 12365576.0, "rewards/KL_reward/mean": -0.0018134694546461105, "rewards/KL_reward/std": 0.0029981709085404873, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.0358327217400074, "rewards/angle_reward/std": 0.6934579610824585, "rewards/thinking_verbosity_reward/mean": -1.3283206224441528, "rewards/thinking_verbosity_reward/std": 0.249838188290596, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 791.09375, "epoch": 0.1054421768707483, "grad_norm": 0.013530511409044266, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 12498508.0, "rewards/KL_reward/mean": -0.0023961542174220085, "rewards/KL_reward/std": 0.0037159009370952845, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.08895541727542877, "rewards/angle_reward/std": 0.7009344696998596, "rewards/thinking_verbosity_reward/mean": -1.3683531284332275, "rewards/thinking_verbosity_reward/std": 0.3219837248325348, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 803.390625, "epoch": 0.10657596371882086, "grad_norm": 0.012814110144972801, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 12633734.0, "rewards/KL_reward/mean": -0.0018958018627017736, "rewards/KL_reward/std": 0.0025667615700513124, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.001182081177830696, "rewards/angle_reward/std": 0.6966385841369629, "rewards/thinking_verbosity_reward/mean": -1.3940694332122803, "rewards/thinking_verbosity_reward/std": 0.2510763704776764, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 775.1796875, "epoch": 0.10770975056689343, "grad_norm": 0.01644972898066044, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 12765061.0, "rewards/KL_reward/mean": -0.0021666029933840036, "rewards/KL_reward/std": 0.003533849259838462, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": 0.01855868473649025, "rewards/angle_reward/std": 0.692621648311615, "rewards/thinking_verbosity_reward/mean": -1.3660519123077393, "rewards/thinking_verbosity_reward/std": 0.2643062174320221, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 710.265625, "epoch": 0.10884353741496598, "grad_norm": 0.016994789242744446, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 12888039.0, "rewards/KL_reward/mean": -0.0026729642413556576, "rewards/KL_reward/std": 0.004195652902126312, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.0013782698661088943, "rewards/angle_reward/std": 0.7274826765060425, "rewards/thinking_verbosity_reward/mean": -1.2989205121994019, "rewards/thinking_verbosity_reward/std": 0.2943909466266632, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 813.5703125, "epoch": 0.10997732426303855, "grad_norm": 0.012327135540544987, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 13023496.0, "rewards/KL_reward/mean": -0.0015367217129096389, "rewards/KL_reward/std": 0.002783792093396187, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.03059232421219349, "rewards/angle_reward/std": 0.6990124583244324, "rewards/thinking_verbosity_reward/mean": -1.3972327709197998, "rewards/thinking_verbosity_reward/std": 0.2823668420314789, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 776.2265625, "epoch": 0.1111111111111111, "grad_norm": 0.015816690400242805, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 13154485.0, "rewards/KL_reward/mean": -0.001837437623180449, "rewards/KL_reward/std": 0.003085511038079858, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.0798722580075264, "rewards/angle_reward/std": 0.7076959609985352, "rewards/thinking_verbosity_reward/mean": -1.359372854232788, "rewards/thinking_verbosity_reward/std": 0.30149850249290466, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 702.625, "epoch": 0.11224489795918367, "grad_norm": 0.017176752910017967, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 13276269.0, "rewards/KL_reward/mean": -0.002208163496106863, "rewards/KL_reward/std": 0.0035135303623974323, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.028847502544522285, "rewards/angle_reward/std": 0.7246776819229126, "rewards/thinking_verbosity_reward/mean": -1.2869293689727783, "rewards/thinking_verbosity_reward/std": 0.3140481412410736, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 740.4609375, "epoch": 0.11337868480725624, "grad_norm": 0.016666896641254425, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 13403208.0, "rewards/KL_reward/mean": -0.0020394723396748304, "rewards/KL_reward/std": 0.003099593333899975, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": -0.04304052144289017, "rewards/angle_reward/std": 0.718730092048645, "rewards/thinking_verbosity_reward/mean": -1.3422832489013672, "rewards/thinking_verbosity_reward/std": 0.2173442840576172, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 673.9296875, "epoch": 0.1145124716553288, "grad_norm": 0.016901105642318726, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 13521735.0, "rewards/KL_reward/mean": -0.002501721028238535, "rewards/KL_reward/std": 0.0035446849651634693, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.05400076508522034, "rewards/angle_reward/std": 0.7020770311355591, "rewards/thinking_verbosity_reward/mean": -1.2788972854614258, "rewards/thinking_verbosity_reward/std": 0.21710321307182312, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 720.796875, "epoch": 0.11564625850340136, "grad_norm": 0.01597214862704277, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 13646085.0, "rewards/KL_reward/mean": -0.0022737011313438416, "rewards/KL_reward/std": 0.0033756305929273367, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.07139294594526291, "rewards/angle_reward/std": 0.7032915949821472, "rewards/thinking_verbosity_reward/mean": -1.3200069665908813, "rewards/thinking_verbosity_reward/std": 0.2397989183664322, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 830.6875, "epoch": 0.11678004535147392, "grad_norm": 0.01694324240088463, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 13784285.0, "rewards/KL_reward/mean": -0.0016397619619965553, "rewards/KL_reward/std": 0.002778163179755211, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": -0.07758922874927521, "rewards/angle_reward/std": 0.7181770205497742, "rewards/thinking_verbosity_reward/mean": -1.4208343029022217, "rewards/thinking_verbosity_reward/std": 0.23640063405036926, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 741.453125, "epoch": 0.11791383219954649, "grad_norm": 0.016481440514326096, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 13910383.0, "rewards/KL_reward/mean": -0.0017886109417304397, "rewards/KL_reward/std": 0.0030333735048770905, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": 0.014164380729198456, "rewards/angle_reward/std": 0.7332705855369568, "rewards/thinking_verbosity_reward/mean": -1.341301679611206, "rewards/thinking_verbosity_reward/std": 0.22893868386745453, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 722.7890625, "epoch": 0.11904761904761904, "grad_norm": 0.015603181906044483, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 14035084.0, "rewards/KL_reward/mean": -0.0022284667938947678, "rewards/KL_reward/std": 0.0033084414899349213, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": 0.010492192581295967, "rewards/angle_reward/std": 0.7046900987625122, "rewards/thinking_verbosity_reward/mean": -1.3199641704559326, "rewards/thinking_verbosity_reward/std": 0.25039249658584595, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 792.3984375, "epoch": 0.12018140589569161, "grad_norm": 0.012676805257797241, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 14167983.0, "rewards/KL_reward/mean": -0.002145292004570365, "rewards/KL_reward/std": 0.0035973675549030304, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.030615121126174927, "rewards/angle_reward/std": 0.6757726073265076, "rewards/thinking_verbosity_reward/mean": -1.3784376382827759, "rewards/thinking_verbosity_reward/std": 0.2810404598712921, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 782.265625, "epoch": 0.12131519274376418, "grad_norm": 0.018651586025953293, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 14299953.0, "rewards/KL_reward/mean": -0.0014412910677492619, "rewards/KL_reward/std": 0.0026455435436218977, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.17002493143081665, "rewards/angle_reward/std": 0.7069966793060303, "rewards/thinking_verbosity_reward/mean": -1.376008152961731, "rewards/thinking_verbosity_reward/std": 0.24543040990829468, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 748.109375, "epoch": 0.12244897959183673, "grad_norm": 0.0168473981320858, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 14427551.0, "rewards/KL_reward/mean": -0.001817174255847931, "rewards/KL_reward/std": 0.0029042731039226055, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.09525615721940994, "rewards/angle_reward/std": 0.7056543231010437, "rewards/thinking_verbosity_reward/mean": -1.3519896268844604, "rewards/thinking_verbosity_reward/std": 0.20033209025859833, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 894.53125, "epoch": 0.1235827664399093, "grad_norm": 0.014377085492014885, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 14573499.0, "rewards/KL_reward/mean": -0.0022030770778656006, "rewards/KL_reward/std": 0.00294887856580317, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.006665170192718506, "rewards/angle_reward/std": 0.7199759483337402, "rewards/thinking_verbosity_reward/mean": -1.4535714387893677, "rewards/thinking_verbosity_reward/std": 0.3491290807723999, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 662.0234375, "epoch": 0.12471655328798185, "grad_norm": 0.01902906969189644, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 14690518.0, "rewards/KL_reward/mean": -0.0027123568579554558, "rewards/KL_reward/std": 0.0032277333084493876, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.04106944054365158, "rewards/angle_reward/std": 0.7126646637916565, "rewards/thinking_verbosity_reward/mean": -1.2683844566345215, "rewards/thinking_verbosity_reward/std": 0.20977459847927094, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 832.5390625, "epoch": 0.12585034013605442, "grad_norm": 0.01605716161429882, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 14829395.0, "rewards/KL_reward/mean": -0.0022059683687984943, "rewards/KL_reward/std": 0.0037288935855031013, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.023672189563512802, "rewards/angle_reward/std": 0.7149680256843567, "rewards/thinking_verbosity_reward/mean": -1.4242630004882812, "rewards/thinking_verbosity_reward/std": 0.2249453365802765, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 676.484375, "epoch": 0.12698412698412698, "grad_norm": 0.017349785193800926, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 14947665.0, "rewards/KL_reward/mean": -0.003657208289951086, "rewards/KL_reward/std": 0.003820694051682949, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.020158935338258743, "rewards/angle_reward/std": 0.7064082622528076, "rewards/thinking_verbosity_reward/mean": -1.2656853199005127, "rewards/thinking_verbosity_reward/std": 0.29552730917930603, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 718.1640625, "epoch": 0.12811791383219956, "grad_norm": 0.016457414254546165, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 15070934.0, "rewards/KL_reward/mean": -0.002713849302381277, "rewards/KL_reward/std": 0.0035665060859173536, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.03854944556951523, "rewards/angle_reward/std": 0.7127638459205627, "rewards/thinking_verbosity_reward/mean": -1.309330940246582, "rewards/thinking_verbosity_reward/std": 0.28132155537605286, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 708.5, "epoch": 0.1292517006802721, "grad_norm": 0.014944375492632389, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 15193350.0, "rewards/KL_reward/mean": -0.0027939858846366405, "rewards/KL_reward/std": 0.003191334195435047, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.061815641820430756, "rewards/angle_reward/std": 0.7114821672439575, "rewards/thinking_verbosity_reward/mean": -1.3131558895111084, "rewards/thinking_verbosity_reward/std": 0.21124057471752167, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 685.53125, "epoch": 0.13038548752834467, "grad_norm": 0.017111442983150482, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 15312882.0, "rewards/KL_reward/mean": -0.0031511278357356787, "rewards/KL_reward/std": 0.0033508387859910727, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.02089923806488514, "rewards/angle_reward/std": 0.6855220198631287, "rewards/thinking_verbosity_reward/mean": -1.2856812477111816, "rewards/thinking_verbosity_reward/std": 0.2425108700990677, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 742.6484375, "epoch": 0.13151927437641722, "grad_norm": 0.017757011577486992, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 15439445.0, "rewards/KL_reward/mean": -0.002031938638538122, "rewards/KL_reward/std": 0.003581864293664694, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.057499468326568604, "rewards/angle_reward/std": 0.7282629609107971, "rewards/thinking_verbosity_reward/mean": -1.3443570137023926, "rewards/thinking_verbosity_reward/std": 0.21719655394554138, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 662.171875, "epoch": 0.1326530612244898, "grad_norm": 0.01628716289997101, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 15556275.0, "rewards/KL_reward/mean": -0.0032515444327145815, "rewards/KL_reward/std": 0.00442019198089838, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": -0.044349588453769684, "rewards/angle_reward/std": 0.7159713506698608, "rewards/thinking_verbosity_reward/mean": -1.2610831260681152, "rewards/thinking_verbosity_reward/std": 0.2511417865753174, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 731.6953125, "epoch": 0.13378684807256236, "grad_norm": 0.014140671119093895, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 15682652.0, "rewards/KL_reward/mean": -0.00220735976472497, "rewards/KL_reward/std": 0.0038968021981418133, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": 0.014626394957304, "rewards/angle_reward/std": 0.7016589641571045, "rewards/thinking_verbosity_reward/mean": -1.3368642330169678, "rewards/thinking_verbosity_reward/std": 0.19941078126430511, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 744.453125, "epoch": 0.1349206349206349, "grad_norm": 0.017708374187350273, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 15810318.0, "rewards/KL_reward/mean": -0.0032416447065770626, "rewards/KL_reward/std": 0.004031994380056858, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": 0.019819986075162888, "rewards/angle_reward/std": 0.7375350594520569, "rewards/thinking_verbosity_reward/mean": -1.3275351524353027, "rewards/thinking_verbosity_reward/std": 0.31135663390159607, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 739.1875, "epoch": 0.1360544217687075, "grad_norm": 0.015575222671031952, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 15936750.0, "rewards/KL_reward/mean": -0.0036213041748851538, "rewards/KL_reward/std": 0.00411019753664732, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.02828877791762352, "rewards/angle_reward/std": 0.723593533039093, "rewards/thinking_verbosity_reward/mean": -1.3370579481124878, "rewards/thinking_verbosity_reward/std": 0.24100078642368317, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 748.1484375, "epoch": 0.13718820861678005, "grad_norm": 0.0148308789357543, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16064481.0, "rewards/KL_reward/mean": -0.0030179324094206095, "rewards/KL_reward/std": 0.004045294597744942, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.026180170476436615, "rewards/angle_reward/std": 0.6928780674934387, "rewards/thinking_verbosity_reward/mean": -1.3432364463806152, "rewards/thinking_verbosity_reward/std": 0.25303834676742554, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 706.546875, "epoch": 0.1383219954648526, "grad_norm": 0.0160621777176857, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16187199.0, "rewards/KL_reward/mean": -0.0030728490091860294, "rewards/KL_reward/std": 0.0036757541820406914, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.03859724476933479, "rewards/angle_reward/std": 0.7228646874427795, "rewards/thinking_verbosity_reward/mean": -1.3010919094085693, "rewards/thinking_verbosity_reward/std": 0.2672620415687561, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 731.1796875, "epoch": 0.13945578231292516, "grad_norm": 0.015178170055150986, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 16312910.0, "rewards/KL_reward/mean": -0.0029267228674143553, "rewards/KL_reward/std": 0.003854723647236824, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": 0.009864788502454758, "rewards/angle_reward/std": 0.7046127319335938, "rewards/thinking_verbosity_reward/mean": -1.335146188735962, "rewards/thinking_verbosity_reward/std": 0.20772947371006012, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 719.2734375, "epoch": 0.14058956916099774, "grad_norm": 0.017491133883595467, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16436281.0, "rewards/KL_reward/mean": -0.0029566772282123566, "rewards/KL_reward/std": 0.003907571081072092, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.021206554025411606, "rewards/angle_reward/std": 0.7237843871116638, "rewards/thinking_verbosity_reward/mean": -1.3069604635238647, "rewards/thinking_verbosity_reward/std": 0.29699769616127014, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 771.0078125, "epoch": 0.1417233560090703, "grad_norm": 0.014866192825138569, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16567234.0, "rewards/KL_reward/mean": -0.0039398204535245895, "rewards/KL_reward/std": 0.004764980636537075, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.06120563670992851, "rewards/angle_reward/std": 0.6730445027351379, "rewards/thinking_verbosity_reward/mean": -1.3563966751098633, "rewards/thinking_verbosity_reward/std": 0.2928489148616791, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 579.8359375, "epoch": 0.14285714285714285, "grad_norm": 0.019122039899230003, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16673613.0, "rewards/KL_reward/mean": -0.0043771034106612206, "rewards/KL_reward/std": 0.004734088201075792, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.03073747828602791, "rewards/angle_reward/std": 0.7280511856079102, "rewards/thinking_verbosity_reward/mean": -1.182072639465332, "rewards/thinking_verbosity_reward/std": 0.2240225225687027, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 734.953125, "epoch": 0.14399092970521543, "grad_norm": 0.017316479235887527, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16799879.0, "rewards/KL_reward/mean": -0.0034142176155000925, "rewards/KL_reward/std": 0.004490741994231939, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.04686341434717178, "rewards/angle_reward/std": 0.7065085172653198, "rewards/thinking_verbosity_reward/mean": -1.333661437034607, "rewards/thinking_verbosity_reward/std": 0.2379365712404251, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 695.78125, "epoch": 0.14512471655328799, "grad_norm": 0.016514454036951065, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 16921403.0, "rewards/KL_reward/mean": -0.0031645637936890125, "rewards/KL_reward/std": 0.0041256980039179325, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.014330117031931877, "rewards/angle_reward/std": 0.7244763374328613, "rewards/thinking_verbosity_reward/mean": -1.2905654907226562, "rewards/thinking_verbosity_reward/std": 0.26780593395233154, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 656.6953125, "epoch": 0.14625850340136054, "grad_norm": 0.01521074865013361, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 17037444.0, "rewards/KL_reward/mean": -0.003692830679938197, "rewards/KL_reward/std": 0.004924299195408821, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.009508412331342697, "rewards/angle_reward/std": 0.6744556427001953, "rewards/thinking_verbosity_reward/mean": -1.2567415237426758, "rewards/thinking_verbosity_reward/std": 0.2456611543893814, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 690.2578125, "epoch": 0.1473922902494331, "grad_norm": 0.01670273207128048, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 17157309.0, "rewards/KL_reward/mean": -0.0034066352527588606, "rewards/KL_reward/std": 0.0034871476236730814, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.009749175980687141, "rewards/angle_reward/std": 0.736328661441803, "rewards/thinking_verbosity_reward/mean": -1.2978801727294922, "rewards/thinking_verbosity_reward/std": 0.19752177596092224, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 701.7890625, "epoch": 0.14852607709750568, "grad_norm": 0.01933622732758522, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 17279162.0, "rewards/KL_reward/mean": -0.00316535378806293, "rewards/KL_reward/std": 0.004378271289169788, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.03546326234936714, "rewards/angle_reward/std": 0.7148266434669495, "rewards/thinking_verbosity_reward/mean": -1.2958853244781494, "rewards/thinking_verbosity_reward/std": 0.2705304026603699, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 757.8046875, "epoch": 0.14965986394557823, "grad_norm": 0.0137477433308959, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 17408545.0, "rewards/KL_reward/mean": -0.003397803520783782, "rewards/KL_reward/std": 0.003958834335207939, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.01723206788301468, "rewards/angle_reward/std": 0.7112488746643066, "rewards/thinking_verbosity_reward/mean": -1.3415770530700684, "rewards/thinking_verbosity_reward/std": 0.3047771751880646, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 637.734375, "epoch": 0.15079365079365079, "grad_norm": 0.019227879121899605, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 17521855.0, "rewards/KL_reward/mean": -0.003851154586300254, "rewards/KL_reward/std": 0.005098348017781973, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.04486759379506111, "rewards/angle_reward/std": 0.7175752520561218, "rewards/thinking_verbosity_reward/mean": -1.2439886331558228, "rewards/thinking_verbosity_reward/std": 0.21095074713230133, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 698.1953125, "epoch": 0.15192743764172337, "grad_norm": 0.017246374860405922, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 17642864.0, "rewards/KL_reward/mean": -0.0029999513644725084, "rewards/KL_reward/std": 0.003456530626863241, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.05042729899287224, "rewards/angle_reward/std": 0.7183257937431335, "rewards/thinking_verbosity_reward/mean": -1.285423994064331, "rewards/thinking_verbosity_reward/std": 0.3023035526275635, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 624.7890625, "epoch": 0.15306122448979592, "grad_norm": 0.015937460586428642, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 17754965.0, "rewards/KL_reward/mean": -0.0046004485338926315, "rewards/KL_reward/std": 0.003918324131518602, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.03442571312189102, "rewards/angle_reward/std": 0.7067039608955383, "rewards/thinking_verbosity_reward/mean": -1.225354790687561, "rewards/thinking_verbosity_reward/std": 0.24156977236270905, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 755.875, "epoch": 0.15419501133786848, "grad_norm": 0.016583018004894257, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 17884061.0, "rewards/KL_reward/mean": -0.0034096338786184788, "rewards/KL_reward/std": 0.004352725576609373, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4513758420944214, "rewards/angle_reward/mean": 0.048420269042253494, "rewards/angle_reward/std": 0.7189984917640686, "rewards/thinking_verbosity_reward/mean": -1.3360917568206787, "rewards/thinking_verbosity_reward/std": 0.3205486834049225, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 751.96875, "epoch": 0.15532879818594103, "grad_norm": 0.015360284596681595, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 18012057.0, "rewards/KL_reward/mean": -0.003828501794487238, "rewards/KL_reward/std": 0.004502009600400925, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.020650874823331833, "rewards/angle_reward/std": 0.7309556007385254, "rewards/thinking_verbosity_reward/mean": -1.337904453277588, "rewards/thinking_verbosity_reward/std": 0.2968203127384186, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 721.5234375, "epoch": 0.1564625850340136, "grad_norm": 0.01785186119377613, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 18136252.0, "rewards/KL_reward/mean": -0.0030521510634571314, "rewards/KL_reward/std": 0.004306137096136808, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.02232043817639351, "rewards/angle_reward/std": 0.7087429761886597, "rewards/thinking_verbosity_reward/mean": -1.3114829063415527, "rewards/thinking_verbosity_reward/std": 0.28632381558418274, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 666.9296875, "epoch": 0.15759637188208617, "grad_norm": 0.018005847930908203, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 18253931.0, "rewards/KL_reward/mean": -0.003561299294233322, "rewards/KL_reward/std": 0.004450182896107435, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.06507096439599991, "rewards/angle_reward/std": 0.6975307464599609, "rewards/thinking_verbosity_reward/mean": -1.2706272602081299, "rewards/thinking_verbosity_reward/std": 0.22487682104110718, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 700.703125, "epoch": 0.15873015873015872, "grad_norm": 0.017706632614135742, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 18375693.0, "rewards/KL_reward/mean": -0.0038045665714889765, "rewards/KL_reward/std": 0.005178486928343773, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.004316430538892746, "rewards/angle_reward/std": 0.7525703310966492, "rewards/thinking_verbosity_reward/mean": -1.2824617624282837, "rewards/thinking_verbosity_reward/std": 0.32464227080345154, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 741.3125, "epoch": 0.1598639455782313, "grad_norm": 0.015213903039693832, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 18502533.0, "rewards/KL_reward/mean": -0.0034087002277374268, "rewards/KL_reward/std": 0.003798006335273385, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": -0.1237906813621521, "rewards/angle_reward/std": 0.7100720405578613, "rewards/thinking_verbosity_reward/mean": -1.3441178798675537, "rewards/thinking_verbosity_reward/std": 0.2108469009399414, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 660.578125, "epoch": 0.16099773242630386, "grad_norm": 0.017703410238027573, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 18619055.0, "rewards/KL_reward/mean": -0.004657331854104996, "rewards/KL_reward/std": 0.005449049174785614, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.0407666377723217, "rewards/angle_reward/std": 0.6839425563812256, "rewards/thinking_verbosity_reward/mean": -1.242754578590393, "rewards/thinking_verbosity_reward/std": 0.3245146870613098, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 637.2421875, "epoch": 0.1621315192743764, "grad_norm": 0.015972137451171875, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 18732246.0, "rewards/KL_reward/mean": -0.004260205198079348, "rewards/KL_reward/std": 0.004415446892380714, "rewards/accuracy_reward/mean": 0.359375, "rewards/accuracy_reward/std": 0.481702595949173, "rewards/angle_reward/mean": 0.1555885374546051, "rewards/angle_reward/std": 0.7082699537277222, "rewards/thinking_verbosity_reward/mean": -1.2461501359939575, "rewards/thinking_verbosity_reward/std": 0.19471445679664612, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 783.53125, "epoch": 0.16326530612244897, "grad_norm": 0.015578354708850384, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 18864938.0, "rewards/KL_reward/mean": -0.004091382492333651, "rewards/KL_reward/std": 0.004373244475573301, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.02433072030544281, "rewards/angle_reward/std": 0.7120692133903503, "rewards/thinking_verbosity_reward/mean": -1.3740453720092773, "rewards/thinking_verbosity_reward/std": 0.26230984926223755, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 668.21875, "epoch": 0.16439909297052155, "grad_norm": 0.017579572275280952, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 18982822.0, "rewards/KL_reward/mean": -0.003975247032940388, "rewards/KL_reward/std": 0.004776740446686745, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.09519442915916443, "rewards/angle_reward/std": 0.7148767113685608, "rewards/thinking_verbosity_reward/mean": -1.2664110660552979, "rewards/thinking_verbosity_reward/std": 0.25450992584228516, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 661.7890625, "epoch": 0.1655328798185941, "grad_norm": 0.01608235016465187, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 19100075.0, "rewards/KL_reward/mean": -0.005266926251351833, "rewards/KL_reward/std": 0.006609444040805101, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.1746762990951538, "rewards/angle_reward/mean": -0.028426187112927437, "rewards/angle_reward/std": 0.697603702545166, "rewards/thinking_verbosity_reward/mean": -1.2481904029846191, "rewards/thinking_verbosity_reward/std": 0.30778783559799194, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 742.0625, "epoch": 0.16666666666666666, "grad_norm": 0.01791374199092388, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 19227435.0, "rewards/KL_reward/mean": -0.003503902815282345, "rewards/KL_reward/std": 0.003946464508771896, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.04175577685236931, "rewards/angle_reward/std": 0.7005606293678284, "rewards/thinking_verbosity_reward/mean": -1.3384881019592285, "rewards/thinking_verbosity_reward/std": 0.2482069581747055, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 716.9453125, "epoch": 0.16780045351473924, "grad_norm": 0.014346621930599213, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 19351300.0, "rewards/KL_reward/mean": -0.003948048688471317, "rewards/KL_reward/std": 0.004674157593399286, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.03262189030647278, "rewards/angle_reward/std": 0.6847970485687256, "rewards/thinking_verbosity_reward/mean": -1.2974423170089722, "rewards/thinking_verbosity_reward/std": 0.3276059031486511, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 691.6875, "epoch": 0.1689342403628118, "grad_norm": 0.018055735155940056, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 19471884.0, "rewards/KL_reward/mean": -0.0039238715544342995, "rewards/KL_reward/std": 0.005335812456905842, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.05350875109434128, "rewards/angle_reward/std": 0.701145350933075, "rewards/thinking_verbosity_reward/mean": -1.2935394048690796, "rewards/thinking_verbosity_reward/std": 0.23215413093566895, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 685.421875, "epoch": 0.17006802721088435, "grad_norm": 0.013902511447668076, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 19591802.0, "rewards/KL_reward/mean": -0.003565979190170765, "rewards/KL_reward/std": 0.004245271440595388, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.06707803905010223, "rewards/angle_reward/std": 0.6980970501899719, "rewards/thinking_verbosity_reward/mean": -1.2839555740356445, "rewards/thinking_verbosity_reward/std": 0.25100815296173096, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 605.4609375, "epoch": 0.1712018140589569, "grad_norm": 0.018808528780937195, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 19701733.0, "rewards/KL_reward/mean": -0.003445668611675501, "rewards/KL_reward/std": 0.004941299092024565, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.0014398042112588882, "rewards/angle_reward/std": 0.7135060429573059, "rewards/thinking_verbosity_reward/mean": -1.2097773551940918, "rewards/thinking_verbosity_reward/std": 0.21896615624427795, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 702.46875, "epoch": 0.17233560090702948, "grad_norm": 0.017743032425642014, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 19822961.0, "rewards/KL_reward/mean": -0.0039419992826879025, "rewards/KL_reward/std": 0.005640941672027111, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.006016073748469353, "rewards/angle_reward/std": 0.7430128455162048, "rewards/thinking_verbosity_reward/mean": -1.291778802871704, "rewards/thinking_verbosity_reward/std": 0.2926243841648102, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 710.6796875, "epoch": 0.17346938775510204, "grad_norm": 0.017250889912247658, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 19945792.0, "rewards/KL_reward/mean": -0.004069920629262924, "rewards/KL_reward/std": 0.004257708787918091, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": -0.1506166160106659, "rewards/angle_reward/std": 0.7420839667320251, "rewards/thinking_verbosity_reward/mean": -1.308785319328308, "rewards/thinking_verbosity_reward/std": 0.24828357994556427, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 676.5234375, "epoch": 0.1746031746031746, "grad_norm": 0.017633303999900818, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 20064731.0, "rewards/KL_reward/mean": -0.004391741007566452, "rewards/KL_reward/std": 0.005697107408195734, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.04565593600273132, "rewards/angle_reward/std": 0.7310495376586914, "rewards/thinking_verbosity_reward/mean": -1.278063416481018, "rewards/thinking_verbosity_reward/std": 0.23613341152668, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 623.9140625, "epoch": 0.17573696145124718, "grad_norm": 0.019788647070527077, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 20176464.0, "rewards/KL_reward/mean": -0.0035868592094630003, "rewards/KL_reward/std": 0.004520603455603123, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.09205663204193115, "rewards/angle_reward/std": 0.7439991235733032, "rewards/thinking_verbosity_reward/mean": -1.2228164672851562, "rewards/thinking_verbosity_reward/std": 0.24998639523983002, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 676.6640625, "epoch": 0.17687074829931973, "grad_norm": 0.022692374885082245, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 20294669.0, "rewards/KL_reward/mean": -0.005349142476916313, "rewards/KL_reward/std": 0.006089122965931892, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4513758420944214, "rewards/angle_reward/mean": 0.08410148322582245, "rewards/angle_reward/std": 0.7432141304016113, "rewards/thinking_verbosity_reward/mean": -1.2565858364105225, "rewards/thinking_verbosity_reward/std": 0.3331960439682007, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 682.1015625, "epoch": 0.17800453514739228, "grad_norm": 0.017600808292627335, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 20414010.0, "rewards/KL_reward/mean": -0.004695170558989048, "rewards/KL_reward/std": 0.004911729600280523, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.012458901852369308, "rewards/angle_reward/std": 0.7319358587265015, "rewards/thinking_verbosity_reward/mean": -1.2779560089111328, "rewards/thinking_verbosity_reward/std": 0.26474660634994507, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 633.6875, "epoch": 0.17913832199546487, "grad_norm": 0.016389839351177216, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 20527138.0, "rewards/KL_reward/mean": -0.004976123105734587, "rewards/KL_reward/std": 0.005367509555071592, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.016020849347114563, "rewards/angle_reward/std": 0.7215960025787354, "rewards/thinking_verbosity_reward/mean": -1.2339168787002563, "rewards/thinking_verbosity_reward/std": 0.24415971338748932, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 627.59375, "epoch": 0.18027210884353742, "grad_norm": 0.018503598868846893, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 20639710.0, "rewards/KL_reward/mean": -0.004522197414189577, "rewards/KL_reward/std": 0.0054156603291630745, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.027919773012399673, "rewards/angle_reward/std": 0.6923909783363342, "rewards/thinking_verbosity_reward/mean": -1.2318589687347412, "rewards/thinking_verbosity_reward/std": 0.22215832769870758, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 624.3046875, "epoch": 0.18140589569160998, "grad_norm": 0.019150318577885628, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 20751677.0, "rewards/KL_reward/mean": -0.005446942523121834, "rewards/KL_reward/std": 0.005401700735092163, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": -0.03322335332632065, "rewards/angle_reward/std": 0.7266063094139099, "rewards/thinking_verbosity_reward/mean": -1.225290298461914, "rewards/thinking_verbosity_reward/std": 0.23961004614830017, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 699.578125, "epoch": 0.18253968253968253, "grad_norm": 0.016021663323044777, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 20873047.0, "rewards/KL_reward/mean": -0.003972027916461229, "rewards/KL_reward/std": 0.00455878023058176, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44340085983276367, "rewards/angle_reward/mean": 0.03642219305038452, "rewards/angle_reward/std": 0.7052026391029358, "rewards/thinking_verbosity_reward/mean": -1.2939989566802979, "rewards/thinking_verbosity_reward/std": 0.2693682610988617, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 755.7421875, "epoch": 0.1836734693877551, "grad_norm": 0.015306858345866203, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21001646.0, "rewards/KL_reward/mean": -0.004739418625831604, "rewards/KL_reward/std": 0.004921747837215662, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.029500054195523262, "rewards/angle_reward/std": 0.7210052609443665, "rewards/thinking_verbosity_reward/mean": -1.3510175943374634, "rewards/thinking_verbosity_reward/std": 0.249182790517807, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 628.515625, "epoch": 0.18480725623582767, "grad_norm": 0.01611776277422905, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21114136.0, "rewards/KL_reward/mean": -0.005245381500571966, "rewards/KL_reward/std": 0.006099026184529066, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.05297985300421715, "rewards/angle_reward/std": 0.66853266954422, "rewards/thinking_verbosity_reward/mean": -1.2296866178512573, "rewards/thinking_verbosity_reward/std": 0.23880408704280853, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 752.7109375, "epoch": 0.18594104308390022, "grad_norm": 0.0170980766415596, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21242723.0, "rewards/KL_reward/mean": -0.004587255418300629, "rewards/KL_reward/std": 0.005099593196064234, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": 0.020532388240098953, "rewards/angle_reward/std": 0.7177428603172302, "rewards/thinking_verbosity_reward/mean": -1.3383744955062866, "rewards/thinking_verbosity_reward/std": 0.29780030250549316, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 668.3125, "epoch": 0.1870748299319728, "grad_norm": 0.018160531297326088, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 21360507.0, "rewards/KL_reward/mean": -0.00526096997782588, "rewards/KL_reward/std": 0.004359393380582333, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.03587224334478378, "rewards/angle_reward/std": 0.6858236789703369, "rewards/thinking_verbosity_reward/mean": -1.2683125734329224, "rewards/thinking_verbosity_reward/std": 0.24522313475608826, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 716.9375, "epoch": 0.18820861678004536, "grad_norm": 0.017265858128666878, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21484555.0, "rewards/KL_reward/mean": -0.006219521164894104, "rewards/KL_reward/std": 0.008716563694179058, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.043499838560819626, "rewards/angle_reward/std": 0.7223663330078125, "rewards/thinking_verbosity_reward/mean": -1.2908995151519775, "rewards/thinking_verbosity_reward/std": 0.3525276184082031, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 781.984375, "epoch": 0.1893424036281179, "grad_norm": 0.01280480157583952, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21616745.0, "rewards/KL_reward/mean": -0.005252651404589415, "rewards/KL_reward/std": 0.004878875333815813, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.02107461914420128, "rewards/angle_reward/std": 0.6955547332763672, "rewards/thinking_verbosity_reward/mean": -1.3725743293762207, "rewards/thinking_verbosity_reward/std": 0.2627544105052948, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 649.390625, "epoch": 0.19047619047619047, "grad_norm": 0.01947556994855404, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21732139.0, "rewards/KL_reward/mean": -0.0048250844702124596, "rewards/KL_reward/std": 0.004948804154992104, "rewards/accuracy_reward/mean": 0.3828125, "rewards/accuracy_reward/std": 0.4879830479621887, "rewards/angle_reward/mean": 0.02219543047249317, "rewards/angle_reward/std": 0.6966620087623596, "rewards/thinking_verbosity_reward/mean": -1.2455801963806152, "rewards/thinking_verbosity_reward/std": 0.26460281014442444, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 709.6171875, "epoch": 0.19160997732426305, "grad_norm": 0.017735688015818596, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 21855034.0, "rewards/KL_reward/mean": -0.00399676663801074, "rewards/KL_reward/std": 0.003837845753878355, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.06983543187379837, "rewards/angle_reward/std": 0.7073096632957458, "rewards/thinking_verbosity_reward/mean": -1.3072407245635986, "rewards/thinking_verbosity_reward/std": 0.2513129711151123, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 671.9921875, "epoch": 0.1927437641723356, "grad_norm": 0.016666453331708908, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 21973249.0, "rewards/KL_reward/mean": -0.0056351423263549805, "rewards/KL_reward/std": 0.006115090101957321, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.025297701358795166, "rewards/angle_reward/std": 0.7012860774993896, "rewards/thinking_verbosity_reward/mean": -1.277807593345642, "rewards/thinking_verbosity_reward/std": 0.21199442446231842, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 709.609375, "epoch": 0.19387755102040816, "grad_norm": 0.017414640635252, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 22096039.0, "rewards/KL_reward/mean": -0.005263416562229395, "rewards/KL_reward/std": 0.005779444705694914, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.02822297252714634, "rewards/angle_reward/std": 0.7202067971229553, "rewards/thinking_verbosity_reward/mean": -1.306566834449768, "rewards/thinking_verbosity_reward/std": 0.25482043623924255, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 813.4609375, "epoch": 0.19501133786848074, "grad_norm": 0.016978643834590912, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 22232370.0, "rewards/KL_reward/mean": -0.0045533087104558945, "rewards/KL_reward/std": 0.004777958616614342, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.09315788745880127, "rewards/angle_reward/std": 0.7188766598701477, "rewards/thinking_verbosity_reward/mean": -1.3977937698364258, "rewards/thinking_verbosity_reward/std": 0.2791663706302643, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 627.953125, "epoch": 0.1961451247165533, "grad_norm": 0.01868283748626709, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 22345340.0, "rewards/KL_reward/mean": -0.005546521861106157, "rewards/KL_reward/std": 0.005959734786301851, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.02303946577012539, "rewards/angle_reward/std": 0.7035124897956848, "rewards/thinking_verbosity_reward/mean": -1.232616662979126, "rewards/thinking_verbosity_reward/std": 0.2200835943222046, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 660.578125, "epoch": 0.19727891156462585, "grad_norm": 0.01470309216529131, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 22461734.0, "rewards/KL_reward/mean": -0.005267022177577019, "rewards/KL_reward/std": 0.005062570795416832, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.0618949793279171, "rewards/angle_reward/std": 0.6813735365867615, "rewards/thinking_verbosity_reward/mean": -1.2643492221832275, "rewards/thinking_verbosity_reward/std": 0.2252192497253418, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 633.5546875, "epoch": 0.1984126984126984, "grad_norm": 0.021716872230172157, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 22574661.0, "rewards/KL_reward/mean": -0.006059582345187664, "rewards/KL_reward/std": 0.007729161065071821, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.016686338931322098, "rewards/angle_reward/std": 0.7589618563652039, "rewards/thinking_verbosity_reward/mean": -1.2372753620147705, "rewards/thinking_verbosity_reward/std": 0.2257297784090042, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 672.90625, "epoch": 0.19954648526077098, "grad_norm": 0.016434574499726295, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 22692505.0, "rewards/KL_reward/mean": -0.006249883212149143, "rewards/KL_reward/std": 0.00694391829892993, "rewards/accuracy_reward/mean": 0.2578125, "rewards/accuracy_reward/std": 0.43914902210235596, "rewards/angle_reward/mean": 0.04599640890955925, "rewards/angle_reward/std": 0.6874366998672485, "rewards/thinking_verbosity_reward/mean": -1.2711100578308105, "rewards/thinking_verbosity_reward/std": 0.25402602553367615, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 714.984375, "epoch": 0.20068027210884354, "grad_norm": 0.016488516703248024, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 22815759.0, "rewards/KL_reward/mean": -0.0067267632111907005, "rewards/KL_reward/std": 0.006903097033500671, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.03039107471704483, "rewards/angle_reward/std": 0.7145715951919556, "rewards/thinking_verbosity_reward/mean": -1.3076138496398926, "rewards/thinking_verbosity_reward/std": 0.2750972509384155, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 678.3046875, "epoch": 0.2018140589569161, "grad_norm": 0.018575025722384453, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 22934798.0, "rewards/KL_reward/mean": -0.005837662611156702, "rewards/KL_reward/std": 0.005946184508502483, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.0672629326581955, "rewards/angle_reward/std": 0.71257084608078, "rewards/thinking_verbosity_reward/mean": -1.2798986434936523, "rewards/thinking_verbosity_reward/std": 0.23570102453231812, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 694.5703125, "epoch": 0.20294784580498867, "grad_norm": 0.016843745484948158, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 23055239.0, "rewards/KL_reward/mean": -0.005394851788878441, "rewards/KL_reward/std": 0.005620879586786032, "rewards/accuracy_reward/mean": 0.3203125, "rewards/accuracy_reward/std": 0.4684300124645233, "rewards/angle_reward/mean": -0.0011367611587047577, "rewards/angle_reward/std": 0.7321102619171143, "rewards/thinking_verbosity_reward/mean": -1.2880403995513916, "rewards/thinking_verbosity_reward/std": 0.2747134864330292, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 594.6640625, "epoch": 0.20408163265306123, "grad_norm": 0.018888045102357864, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 23162676.0, "rewards/KL_reward/mean": -0.006592948921024799, "rewards/KL_reward/std": 0.0066827163100242615, "rewards/accuracy_reward/mean": 0.3515625, "rewards/accuracy_reward/std": 0.4793342351913452, "rewards/angle_reward/mean": 0.06055489182472229, "rewards/angle_reward/std": 0.7142935395240784, "rewards/thinking_verbosity_reward/mean": -1.1961841583251953, "rewards/thinking_verbosity_reward/std": 0.23169764876365662, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 712.2578125, "epoch": 0.20521541950113378, "grad_norm": 0.01747574843466282, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 23286325.0, "rewards/KL_reward/mean": -0.006673584692180157, "rewards/KL_reward/std": 0.006165490951389074, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": 0.03159511089324951, "rewards/angle_reward/std": 0.6844731569290161, "rewards/thinking_verbosity_reward/mean": -1.304220199584961, "rewards/thinking_verbosity_reward/std": 0.2787870168685913, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 577.1796875, "epoch": 0.20634920634920634, "grad_norm": 0.02173023857176304, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 23392516.0, "rewards/KL_reward/mean": -0.007380689959973097, "rewards/KL_reward/std": 0.005968100391328335, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": 0.017968900501728058, "rewards/angle_reward/std": 0.7066680192947388, "rewards/thinking_verbosity_reward/mean": -1.1748523712158203, "rewards/thinking_verbosity_reward/std": 0.24626189470291138, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 620.234375, "epoch": 0.20748299319727892, "grad_norm": 0.015387197025120258, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 23503202.0, "rewards/KL_reward/mean": -0.004902666434645653, "rewards/KL_reward/std": 0.005735761020332575, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.07241170853376389, "rewards/angle_reward/std": 0.6873176693916321, "rewards/thinking_verbosity_reward/mean": -1.226454496383667, "rewards/thinking_verbosity_reward/std": 0.21028883755207062, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 694.9609375, "epoch": 0.20861678004535147, "grad_norm": 0.01568688452243805, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 23623917.0, "rewards/KL_reward/mean": -0.005722587462514639, "rewards/KL_reward/std": 0.006160435266792774, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": 0.00042659416794776917, "rewards/angle_reward/std": 0.709008514881134, "rewards/thinking_verbosity_reward/mean": -1.2791794538497925, "rewards/thinking_verbosity_reward/std": 0.3151502311229706, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 724.484375, "epoch": 0.20975056689342403, "grad_norm": 0.019990328699350357, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 23748843.0, "rewards/KL_reward/mean": -0.005906912498176098, "rewards/KL_reward/std": 0.005887902807444334, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.06703461706638336, "rewards/angle_reward/std": 0.7245620489120483, "rewards/thinking_verbosity_reward/mean": -1.2944716215133667, "rewards/thinking_verbosity_reward/std": 0.3659032881259918, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 657.5703125, "epoch": 0.2108843537414966, "grad_norm": 0.016637342050671577, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 23864540.0, "rewards/KL_reward/mean": -0.006674261763691902, "rewards/KL_reward/std": 0.006491075269877911, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.02003803290426731, "rewards/angle_reward/std": 0.7134320735931396, "rewards/thinking_verbosity_reward/mean": -1.2597732543945312, "rewards/thinking_verbosity_reward/std": 0.23402182757854462, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 673.4453125, "epoch": 0.21201814058956917, "grad_norm": 0.01620086468756199, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 23982269.0, "rewards/KL_reward/mean": -0.007163557223975658, "rewards/KL_reward/std": 0.006470571272075176, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.12528406083583832, "rewards/angle_reward/std": 0.7103714942932129, "rewards/thinking_verbosity_reward/mean": -1.2723393440246582, "rewards/thinking_verbosity_reward/std": 0.25051143765449524, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 644.0234375, "epoch": 0.21315192743764172, "grad_norm": 0.019968464970588684, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24097072.0, "rewards/KL_reward/mean": -0.0054155196994543076, "rewards/KL_reward/std": 0.006106829270720482, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.005951676517724991, "rewards/angle_reward/std": 0.7312422394752502, "rewards/thinking_verbosity_reward/mean": -1.2448339462280273, "rewards/thinking_verbosity_reward/std": 0.2416672259569168, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 635.1796875, "epoch": 0.21428571428571427, "grad_norm": 0.018636178225278854, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24209991.0, "rewards/KL_reward/mean": -0.006905479356646538, "rewards/KL_reward/std": 0.007862304337322712, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.023817699402570724, "rewards/angle_reward/std": 0.7048460841178894, "rewards/thinking_verbosity_reward/mean": -1.231690764427185, "rewards/thinking_verbosity_reward/std": 0.2624240815639496, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 660.0546875, "epoch": 0.21541950113378686, "grad_norm": 0.018882103264331818, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 24326934.0, "rewards/KL_reward/mean": -0.006683792918920517, "rewards/KL_reward/std": 0.007892372086644173, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.006928920745849609, "rewards/angle_reward/std": 0.7000882029533386, "rewards/thinking_verbosity_reward/mean": -1.244320034980774, "rewards/thinking_verbosity_reward/std": 0.3162398934364319, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 713.0703125, "epoch": 0.2165532879818594, "grad_norm": 0.01985430158674717, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24449231.0, "rewards/KL_reward/mean": -0.006377742625772953, "rewards/KL_reward/std": 0.006911171600222588, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": -0.02648763358592987, "rewards/angle_reward/std": 0.7328572869300842, "rewards/thinking_verbosity_reward/mean": -1.3061573505401611, "rewards/thinking_verbosity_reward/std": 0.273336261510849, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 650.5546875, "epoch": 0.21768707482993196, "grad_norm": 0.02458028309047222, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24564926.0, "rewards/KL_reward/mean": -0.005544313229620457, "rewards/KL_reward/std": 0.005503201391547918, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.05760467052459717, "rewards/angle_reward/std": 0.7331093549728394, "rewards/thinking_verbosity_reward/mean": -1.2320880889892578, "rewards/thinking_verbosity_reward/std": 0.32661738991737366, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 669.6953125, "epoch": 0.21882086167800455, "grad_norm": 0.01643040031194687, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24682391.0, "rewards/KL_reward/mean": -0.006706792861223221, "rewards/KL_reward/std": 0.007362376432865858, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.03348740190267563, "rewards/angle_reward/std": 0.7014281749725342, "rewards/thinking_verbosity_reward/mean": -1.2702393531799316, "rewards/thinking_verbosity_reward/std": 0.24225826561450958, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 633.5234375, "epoch": 0.2199546485260771, "grad_norm": 0.018671313300728798, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24795274.0, "rewards/KL_reward/mean": -0.0069778538309037685, "rewards/KL_reward/std": 0.007189198397099972, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.0068779680877923965, "rewards/angle_reward/std": 0.7145563364028931, "rewards/thinking_verbosity_reward/mean": -1.2367829084396362, "rewards/thinking_verbosity_reward/std": 0.22817501425743103, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 744.015625, "epoch": 0.22108843537414966, "grad_norm": 0.01673026755452156, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 24922652.0, "rewards/KL_reward/mean": -0.005742833949625492, "rewards/KL_reward/std": 0.005718303844332695, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.051033779978752136, "rewards/angle_reward/std": 0.7201513051986694, "rewards/thinking_verbosity_reward/mean": -1.3313870429992676, "rewards/thinking_verbosity_reward/std": 0.2926025390625, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 608.9765625, "epoch": 0.2222222222222222, "grad_norm": 0.01862356625497341, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 25032849.0, "rewards/KL_reward/mean": -0.006777351722121239, "rewards/KL_reward/std": 0.007024673279374838, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": -0.097476065158844, "rewards/angle_reward/std": 0.7260298728942871, "rewards/thinking_verbosity_reward/mean": -1.206028699874878, "rewards/thinking_verbosity_reward/std": 0.25673383474349976, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 657.9921875, "epoch": 0.2233560090702948, "grad_norm": 0.020741797983646393, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 25149072.0, "rewards/KL_reward/mean": -0.007296534720808268, "rewards/KL_reward/std": 0.00537205720320344, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.03954760730266571, "rewards/angle_reward/std": 0.6764101982116699, "rewards/thinking_verbosity_reward/mean": -1.2598237991333008, "rewards/thinking_verbosity_reward/std": 0.23613522946834564, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 728.046875, "epoch": 0.22448979591836735, "grad_norm": 0.01808641105890274, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 25274166.0, "rewards/KL_reward/mean": -0.005314648151397705, "rewards/KL_reward/std": 0.005181093234568834, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45867621898651123, "rewards/angle_reward/mean": -0.05089155212044716, "rewards/angle_reward/std": 0.6896182894706726, "rewards/thinking_verbosity_reward/mean": -1.3157517910003662, "rewards/thinking_verbosity_reward/std": 0.29521846771240234, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 676.8828125, "epoch": 0.2256235827664399, "grad_norm": 0.018155431374907494, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 25392647.0, "rewards/KL_reward/mean": -0.0088615408167243, "rewards/KL_reward/std": 0.008958039805293083, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.017155885696411133, "rewards/angle_reward/std": 0.7221205234527588, "rewards/thinking_verbosity_reward/mean": -1.254866600036621, "rewards/thinking_verbosity_reward/std": 0.3405318260192871, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 631.78125, "epoch": 0.22675736961451248, "grad_norm": 0.01933540403842926, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 25505107.0, "rewards/KL_reward/mean": -0.007805454544723034, "rewards/KL_reward/std": 0.006577686406672001, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.09517930448055267, "rewards/angle_reward/std": 0.7301434278488159, "rewards/thinking_verbosity_reward/mean": -1.2178688049316406, "rewards/thinking_verbosity_reward/std": 0.30729082226753235, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 627.984375, "epoch": 0.22789115646258504, "grad_norm": 0.020539090037345886, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 25618113.0, "rewards/KL_reward/mean": -0.00769702298566699, "rewards/KL_reward/std": 0.005597667768597603, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.05146237462759018, "rewards/angle_reward/std": 0.7511812448501587, "rewards/thinking_verbosity_reward/mean": -1.2318042516708374, "rewards/thinking_verbosity_reward/std": 0.22466543316841125, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 630.4140625, "epoch": 0.2290249433106576, "grad_norm": 0.019574997946619987, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 25730702.0, "rewards/KL_reward/mean": -0.00823692511767149, "rewards/KL_reward/std": 0.010290653444826603, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.009376266971230507, "rewards/angle_reward/std": 0.7389089465141296, "rewards/thinking_verbosity_reward/mean": -1.2259137630462646, "rewards/thinking_verbosity_reward/std": 0.26688382029533386, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 568.375, "epoch": 0.23015873015873015, "grad_norm": 0.021541284397244453, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 25835294.0, "rewards/KL_reward/mean": -0.008544719778001308, "rewards/KL_reward/std": 0.007308653090149164, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.06592728942632675, "rewards/angle_reward/std": 0.7116559743881226, "rewards/thinking_verbosity_reward/mean": -1.162217140197754, "rewards/thinking_verbosity_reward/std": 0.26111775636672974, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 715.265625, "epoch": 0.23129251700680273, "grad_norm": 0.01561558898538351, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 25958592.0, "rewards/KL_reward/mean": -0.006122298073023558, "rewards/KL_reward/std": 0.005869515240192413, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.055072057992219925, "rewards/angle_reward/std": 0.7151932716369629, "rewards/thinking_verbosity_reward/mean": -1.3064992427825928, "rewards/thinking_verbosity_reward/std": 0.28168007731437683, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 712.671875, "epoch": 0.23242630385487528, "grad_norm": 0.019142666831612587, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 26081982.0, "rewards/KL_reward/mean": -0.006161075085401535, "rewards/KL_reward/std": 0.006036452483385801, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.04611802101135254, "rewards/angle_reward/std": 0.7256365418434143, "rewards/thinking_verbosity_reward/mean": -1.3043410778045654, "rewards/thinking_verbosity_reward/std": 0.28012052178382874, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 730.2421875, "epoch": 0.23356009070294784, "grad_norm": 0.016479630023241043, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 26207229.0, "rewards/KL_reward/mean": -0.00602799654006958, "rewards/KL_reward/std": 0.0058203889057040215, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.03317299112677574, "rewards/angle_reward/std": 0.6820391416549683, "rewards/thinking_verbosity_reward/mean": -1.3224318027496338, "rewards/thinking_verbosity_reward/std": 0.2736136019229889, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 738.328125, "epoch": 0.23469387755102042, "grad_norm": 0.015681680291891098, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 26333447.0, "rewards/KL_reward/mean": -0.005454889498651028, "rewards/KL_reward/std": 0.00478137843310833, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": 0.024364903569221497, "rewards/angle_reward/std": 0.7025476098060608, "rewards/thinking_verbosity_reward/mean": -1.340835452079773, "rewards/thinking_verbosity_reward/std": 0.21396200358867645, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 587.4765625, "epoch": 0.23582766439909297, "grad_norm": 0.019554290920495987, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 26440188.0, "rewards/KL_reward/mean": -0.006931736133992672, "rewards/KL_reward/std": 0.005958786234259605, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.03052525222301483, "rewards/angle_reward/std": 0.7194740772247314, "rewards/thinking_verbosity_reward/mean": -1.1897162199020386, "rewards/thinking_verbosity_reward/std": 0.22624364495277405, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 592.21875, "epoch": 0.23696145124716553, "grad_norm": 0.020028211176395416, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 26548352.0, "rewards/KL_reward/mean": -0.007310900837182999, "rewards/KL_reward/std": 0.00627494789659977, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.03306598216295242, "rewards/angle_reward/std": 0.7165825366973877, "rewards/thinking_verbosity_reward/mean": -1.190523386001587, "rewards/thinking_verbosity_reward/std": 0.24726401269435883, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 634.4140625, "epoch": 0.23809523809523808, "grad_norm": 0.015789972618222237, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 26661957.0, "rewards/KL_reward/mean": -0.007268001325428486, "rewards/KL_reward/std": 0.006919928826391697, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": -0.01383179146796465, "rewards/angle_reward/std": 0.6974374651908875, "rewards/thinking_verbosity_reward/mean": -1.238418698310852, "rewards/thinking_verbosity_reward/std": 0.22415798902511597, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 711.7421875, "epoch": 0.23922902494331066, "grad_norm": 0.016343258321285248, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 26785052.0, "rewards/KL_reward/mean": -0.00694778747856617, "rewards/KL_reward/std": 0.0060117426328361034, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": 0.012593336403369904, "rewards/angle_reward/std": 0.7179101705551147, "rewards/thinking_verbosity_reward/mean": -1.2946412563323975, "rewards/thinking_verbosity_reward/std": 0.31867337226867676, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 576.5, "epoch": 0.24036281179138322, "grad_norm": 0.01915968768298626, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 26890964.0, "rewards/KL_reward/mean": -0.0069218226708471775, "rewards/KL_reward/std": 0.0060286265797913074, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.041994426399469376, "rewards/angle_reward/std": 0.7082900404930115, "rewards/thinking_verbosity_reward/mean": -1.1769347190856934, "rewards/thinking_verbosity_reward/std": 0.23232969641685486, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 584.6875, "epoch": 0.24149659863945577, "grad_norm": 0.02144310250878334, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 26997972.0, "rewards/KL_reward/mean": -0.00726801622658968, "rewards/KL_reward/std": 0.006610418204218149, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.05485362187027931, "rewards/angle_reward/std": 0.6939930319786072, "rewards/thinking_verbosity_reward/mean": -1.1847119331359863, "rewards/thinking_verbosity_reward/std": 0.2368851751089096, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 699.6015625, "epoch": 0.24263038548752835, "grad_norm": 0.018099481239914894, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 27119977.0, "rewards/KL_reward/mean": -0.0071920109912753105, "rewards/KL_reward/std": 0.005825213622301817, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": 0.05808936432003975, "rewards/angle_reward/std": 0.6950080990791321, "rewards/thinking_verbosity_reward/mean": -1.2834217548370361, "rewards/thinking_verbosity_reward/std": 0.3164091110229492, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 672.8984375, "epoch": 0.2437641723356009, "grad_norm": 0.01659393310546875, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 27238180.0, "rewards/KL_reward/mean": -0.007104963064193726, "rewards/KL_reward/std": 0.006704287137836218, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": 0.05779623985290527, "rewards/angle_reward/std": 0.68217533826828, "rewards/thinking_verbosity_reward/mean": -1.2643368244171143, "rewards/thinking_verbosity_reward/std": 0.2861078679561615, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 625.046875, "epoch": 0.24489795918367346, "grad_norm": 0.02007698453962803, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 27349314.0, "rewards/KL_reward/mean": -0.008289286866784096, "rewards/KL_reward/std": 0.007536042481660843, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45867621898651123, "rewards/angle_reward/mean": -0.006564207375049591, "rewards/angle_reward/std": 0.6777621507644653, "rewards/thinking_verbosity_reward/mean": -1.2202589511871338, "rewards/thinking_verbosity_reward/std": 0.2676292657852173, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 649.1640625, "epoch": 0.24603174603174602, "grad_norm": 0.019388757646083832, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 27464199.0, "rewards/KL_reward/mean": -0.0074159977957606316, "rewards/KL_reward/std": 0.007599027827382088, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.46208351850509644, "rewards/angle_reward/mean": -0.041372545063495636, "rewards/angle_reward/std": 0.7221158146858215, "rewards/thinking_verbosity_reward/mean": -1.249146580696106, "rewards/thinking_verbosity_reward/std": 0.24595995247364044, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 688.09375, "epoch": 0.2471655328798186, "grad_norm": 0.020711416378617287, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 27583491.0, "rewards/KL_reward/mean": -0.008757997304201126, "rewards/KL_reward/std": 0.008113629184663296, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.049838222563266754, "rewards/angle_reward/std": 0.7196351885795593, "rewards/thinking_verbosity_reward/mean": -1.2596156597137451, "rewards/thinking_verbosity_reward/std": 0.3635037839412689, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 687.9140625, "epoch": 0.24829931972789115, "grad_norm": 0.019323069602251053, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 27703360.0, "rewards/KL_reward/mean": -0.006892567500472069, "rewards/KL_reward/std": 0.006044676527380943, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.059788353741168976, "rewards/angle_reward/std": 0.734419584274292, "rewards/thinking_verbosity_reward/mean": -1.2848849296569824, "rewards/thinking_verbosity_reward/std": 0.25860872864723206, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 574.546875, "epoch": 0.2494331065759637, "grad_norm": 0.019302789121866226, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 27809030.0, "rewards/KL_reward/mean": -0.006590262986719608, "rewards/KL_reward/std": 0.006840670946985483, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44340085983276367, "rewards/angle_reward/mean": -0.0019294023513793945, "rewards/angle_reward/std": 0.7161747217178345, "rewards/thinking_verbosity_reward/mean": -1.1663904190063477, "rewards/thinking_verbosity_reward/std": 0.2719900608062744, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 689.09375, "epoch": 0.25056689342403626, "grad_norm": 0.01757318712770939, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 27928754.0, "rewards/KL_reward/mean": -0.007285448722541332, "rewards/KL_reward/std": 0.006790507584810257, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": 0.013903097249567509, "rewards/angle_reward/std": 0.7119504809379578, "rewards/thinking_verbosity_reward/mean": -1.2859375476837158, "rewards/thinking_verbosity_reward/std": 0.25896739959716797, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 686.34375, "epoch": 0.25170068027210885, "grad_norm": 0.01846483163535595, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28048862.0, "rewards/KL_reward/mean": -0.007576250471174717, "rewards/KL_reward/std": 0.0076942844316363335, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.010099878534674644, "rewards/angle_reward/std": 0.7031623125076294, "rewards/thinking_verbosity_reward/mean": -1.2700605392456055, "rewards/thinking_verbosity_reward/std": 0.31792449951171875, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 668.4453125, "epoch": 0.2528344671201814, "grad_norm": 0.01812918856739998, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28166479.0, "rewards/KL_reward/mean": -0.0062308646738529205, "rewards/KL_reward/std": 0.006511870305985212, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.050949279218912125, "rewards/angle_reward/std": 0.705336332321167, "rewards/thinking_verbosity_reward/mean": -1.2666432857513428, "rewards/thinking_verbosity_reward/std": 0.2544286251068115, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 646.8359375, "epoch": 0.25396825396825395, "grad_norm": 0.022773467004299164, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 28281346.0, "rewards/KL_reward/mean": -0.006268291734158993, "rewards/KL_reward/std": 0.007012519519776106, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.447474867105484, "rewards/angle_reward/mean": -0.10227032750844955, "rewards/angle_reward/std": 0.7190137505531311, "rewards/thinking_verbosity_reward/mean": -1.248525857925415, "rewards/thinking_verbosity_reward/std": 0.23709122836589813, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 702.3046875, "epoch": 0.25510204081632654, "grad_norm": 0.020715147256851196, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28403489.0, "rewards/KL_reward/mean": -0.0062208836898207664, "rewards/KL_reward/std": 0.005619505885988474, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.0026268325746059418, "rewards/angle_reward/std": 0.7080735564231873, "rewards/thinking_verbosity_reward/mean": -1.2969403266906738, "rewards/thinking_verbosity_reward/std": 0.2679326832294464, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 678.3359375, "epoch": 0.2562358276643991, "grad_norm": 0.01669652946293354, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28521908.0, "rewards/KL_reward/mean": -0.006833111867308617, "rewards/KL_reward/std": 0.006781416945159435, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.009288817644119263, "rewards/angle_reward/std": 0.711402177810669, "rewards/thinking_verbosity_reward/mean": -1.2574052810668945, "rewards/thinking_verbosity_reward/std": 0.33632826805114746, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 662.515625, "epoch": 0.25736961451247165, "grad_norm": 0.0172143392264843, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28639022.0, "rewards/KL_reward/mean": -0.006038540508598089, "rewards/KL_reward/std": 0.006242460571229458, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.006223671138286591, "rewards/angle_reward/std": 0.7244890332221985, "rewards/thinking_verbosity_reward/mean": -1.2689580917358398, "rewards/thinking_verbosity_reward/std": 0.20937541127204895, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 671.0234375, "epoch": 0.2585034013605442, "grad_norm": 0.01777282916009426, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 28756705.0, "rewards/KL_reward/mean": -0.006924336310476065, "rewards/KL_reward/std": 0.006979053374379873, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.003234894946217537, "rewards/angle_reward/std": 0.7178221344947815, "rewards/thinking_verbosity_reward/mean": -1.2622389793395996, "rewards/thinking_verbosity_reward/std": 0.2871507406234741, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 714.59375, "epoch": 0.25963718820861675, "grad_norm": 0.015465959906578064, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28880077.0, "rewards/KL_reward/mean": -0.006152710411697626, "rewards/KL_reward/std": 0.0048719304613769054, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.05258213356137276, "rewards/angle_reward/std": 0.7208319306373596, "rewards/thinking_verbosity_reward/mean": -1.3186596632003784, "rewards/thinking_verbosity_reward/std": 0.21306687593460083, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 623.875, "epoch": 0.26077097505668934, "grad_norm": 0.01664859429001808, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 28991597.0, "rewards/KL_reward/mean": -0.006472930312156677, "rewards/KL_reward/std": 0.0078015257604420185, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.11813303828239441, "rewards/angle_reward/std": 0.6910309195518494, "rewards/thinking_verbosity_reward/mean": -1.2249972820281982, "rewards/thinking_verbosity_reward/std": 0.23877747356891632, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 656.6640625, "epoch": 0.2619047619047619, "grad_norm": 0.014789329841732979, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 29107346.0, "rewards/KL_reward/mean": -0.00707493769004941, "rewards/KL_reward/std": 0.007159274537116289, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.018108312040567398, "rewards/angle_reward/std": 0.6884683966636658, "rewards/thinking_verbosity_reward/mean": -1.259082555770874, "rewards/thinking_verbosity_reward/std": 0.2328866571187973, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 664.0390625, "epoch": 0.26303854875283444, "grad_norm": 0.01746738702058792, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 29223791.0, "rewards/KL_reward/mean": -0.006290224380791187, "rewards/KL_reward/std": 0.005722086876630783, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.02040044218301773, "rewards/angle_reward/std": 0.6894627809524536, "rewards/thinking_verbosity_reward/mean": -1.2532029151916504, "rewards/thinking_verbosity_reward/std": 0.2962525188922882, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 690.0078125, "epoch": 0.264172335600907, "grad_norm": 0.01679716631770134, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 29344472.0, "rewards/KL_reward/mean": -0.006983471103012562, "rewards/KL_reward/std": 0.005662387702614069, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": 0.028897471725940704, "rewards/angle_reward/std": 0.7165486812591553, "rewards/thinking_verbosity_reward/mean": -1.2830872535705566, "rewards/thinking_verbosity_reward/std": 0.27706924080848694, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 563.6171875, "epoch": 0.2653061224489796, "grad_norm": 0.019079288467764854, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 29449143.0, "rewards/KL_reward/mean": -0.008397059515118599, "rewards/KL_reward/std": 0.008088743314146996, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.021606430411338806, "rewards/angle_reward/std": 0.7255200743675232, "rewards/thinking_verbosity_reward/mean": -1.1630661487579346, "rewards/thinking_verbosity_reward/std": 0.23281878232955933, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 681.578125, "epoch": 0.26643990929705214, "grad_norm": 0.01799463853240013, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 29568177.0, "rewards/KL_reward/mean": -0.006513974629342556, "rewards/KL_reward/std": 0.0063631427474319935, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.05794283002614975, "rewards/angle_reward/std": 0.7300118803977966, "rewards/thinking_verbosity_reward/mean": -1.268895149230957, "rewards/thinking_verbosity_reward/std": 0.30339720845222473, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 643.3984375, "epoch": 0.2675736961451247, "grad_norm": 0.016560139134526253, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 29682316.0, "rewards/KL_reward/mean": -0.007001581601798534, "rewards/KL_reward/std": 0.007529091089963913, "rewards/accuracy_reward/mean": 0.3515625, "rewards/accuracy_reward/std": 0.4793342351913452, "rewards/angle_reward/mean": -0.009729281067848206, "rewards/angle_reward/std": 0.654257595539093, "rewards/thinking_verbosity_reward/mean": -1.2387820482254028, "rewards/thinking_verbosity_reward/std": 0.26832306385040283, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 616.9296875, "epoch": 0.2687074829931973, "grad_norm": 0.01996929757297039, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 29793427.0, "rewards/KL_reward/mean": -0.008148357272148132, "rewards/KL_reward/std": 0.006440743338316679, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.01599818468093872, "rewards/angle_reward/std": 0.7043137550354004, "rewards/thinking_verbosity_reward/mean": -1.2252111434936523, "rewards/thinking_verbosity_reward/std": 0.1973995566368103, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 707.1640625, "epoch": 0.2698412698412698, "grad_norm": 0.01589009165763855, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 29916136.0, "rewards/KL_reward/mean": -0.007304298225790262, "rewards/KL_reward/std": 0.007028522901237011, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.022989019751548767, "rewards/angle_reward/std": 0.7293647527694702, "rewards/thinking_verbosity_reward/mean": -1.2988755702972412, "rewards/thinking_verbosity_reward/std": 0.28087329864501953, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 608.546875, "epoch": 0.2709750566893424, "grad_norm": 0.020093601197004318, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 30025806.0, "rewards/KL_reward/mean": -0.007500161416828632, "rewards/KL_reward/std": 0.006967926397919655, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.0615442618727684, "rewards/angle_reward/std": 0.7171436548233032, "rewards/thinking_verbosity_reward/mean": -1.215866208076477, "rewards/thinking_verbosity_reward/std": 0.20203055441379547, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 687.5390625, "epoch": 0.272108843537415, "grad_norm": 0.019968822598457336, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 30146115.0, "rewards/KL_reward/mean": -0.0064809489995241165, "rewards/KL_reward/std": 0.006810910999774933, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.010318879038095474, "rewards/angle_reward/std": 0.732631504535675, "rewards/thinking_verbosity_reward/mean": -1.2788469791412354, "rewards/thinking_verbosity_reward/std": 0.2855779230594635, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 705.3984375, "epoch": 0.2732426303854875, "grad_norm": 0.02054375596344471, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 30268374.0, "rewards/KL_reward/mean": -0.007403380237519741, "rewards/KL_reward/std": 0.007512849755585194, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.007134806364774704, "rewards/angle_reward/std": 0.7200664281845093, "rewards/thinking_verbosity_reward/mean": -1.2822654247283936, "rewards/thinking_verbosity_reward/std": 0.34311890602111816, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 656.6171875, "epoch": 0.2743764172335601, "grad_norm": 0.018039554357528687, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 30384125.0, "rewards/KL_reward/mean": -0.008808700367808342, "rewards/KL_reward/std": 0.007601337973028421, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.059877797961235046, "rewards/angle_reward/std": 0.6769348978996277, "rewards/thinking_verbosity_reward/mean": -1.2398130893707275, "rewards/thinking_verbosity_reward/std": 0.32048264145851135, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 558.1875, "epoch": 0.2755102040816326, "grad_norm": 0.01742182858288288, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 30487541.0, "rewards/KL_reward/mean": -0.009268783032894135, "rewards/KL_reward/std": 0.009539565071463585, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.01601773127913475, "rewards/angle_reward/std": 0.7038976550102234, "rewards/thinking_verbosity_reward/mean": -1.160976767539978, "rewards/thinking_verbosity_reward/std": 0.21325227618217468, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 638.0546875, "epoch": 0.2766439909297052, "grad_norm": 0.02026633732020855, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 30600956.0, "rewards/KL_reward/mean": -0.007554056588560343, "rewards/KL_reward/std": 0.007229703012853861, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/angle_reward/mean": -0.03071395494043827, "rewards/angle_reward/std": 0.7103466987609863, "rewards/thinking_verbosity_reward/mean": -1.2354111671447754, "rewards/thinking_verbosity_reward/std": 0.2587624788284302, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 605.96875, "epoch": 0.2777777777777778, "grad_norm": 0.02038923092186451, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 30710352.0, "rewards/KL_reward/mean": -0.008146708831191063, "rewards/KL_reward/std": 0.008665196597576141, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45867621898651123, "rewards/angle_reward/mean": -0.035703569650650024, "rewards/angle_reward/std": 0.7312946319580078, "rewards/thinking_verbosity_reward/mean": -1.197398066520691, "rewards/thinking_verbosity_reward/std": 0.28156572580337524, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 633.4453125, "epoch": 0.2789115646258503, "grad_norm": 0.018543001264333725, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 30823161.0, "rewards/KL_reward/mean": -0.0074415747076272964, "rewards/KL_reward/std": 0.00573266576975584, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.05781393498182297, "rewards/angle_reward/std": 0.7158551216125488, "rewards/thinking_verbosity_reward/mean": -1.2390161752700806, "rewards/thinking_verbosity_reward/std": 0.2151612788438797, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 643.2421875, "epoch": 0.2800453514739229, "grad_norm": 0.01924288645386696, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 30937840.0, "rewards/KL_reward/mean": -0.007157555781304836, "rewards/KL_reward/std": 0.00579442223533988, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.005112181417644024, "rewards/angle_reward/std": 0.7062931656837463, "rewards/thinking_verbosity_reward/mean": -1.250734806060791, "rewards/thinking_verbosity_reward/std": 0.20379839837551117, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 614.3046875, "epoch": 0.2811791383219955, "grad_norm": 0.02079552412033081, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31048727.0, "rewards/KL_reward/mean": -0.008803189732134342, "rewards/KL_reward/std": 0.008303167298436165, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.02736246958374977, "rewards/angle_reward/std": 0.7156417369842529, "rewards/thinking_verbosity_reward/mean": -1.2185291051864624, "rewards/thinking_verbosity_reward/std": 0.22128619253635406, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 686.7734375, "epoch": 0.282312925170068, "grad_norm": 0.015999827533960342, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31168642.0, "rewards/KL_reward/mean": -0.005788822192698717, "rewards/KL_reward/std": 0.005088903941214085, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": 0.09513027220964432, "rewards/angle_reward/std": 0.708631694316864, "rewards/thinking_verbosity_reward/mean": -1.287979006767273, "rewards/thinking_verbosity_reward/std": 0.2363952398300171, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 676.6953125, "epoch": 0.2834467120181406, "grad_norm": 0.016545815393328667, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 31287899.0, "rewards/KL_reward/mean": -0.007248513400554657, "rewards/KL_reward/std": 0.007056637667119503, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": 0.050945453345775604, "rewards/angle_reward/std": 0.6914721131324768, "rewards/thinking_verbosity_reward/mean": -1.2786433696746826, "rewards/thinking_verbosity_reward/std": 0.23383289575576782, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 607.78125, "epoch": 0.28458049886621317, "grad_norm": 0.01953643001616001, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31397447.0, "rewards/KL_reward/mean": -0.008097594603896141, "rewards/KL_reward/std": 0.00676872069016099, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": 0.00671498104929924, "rewards/angle_reward/std": 0.6774688959121704, "rewards/thinking_verbosity_reward/mean": -1.2069485187530518, "rewards/thinking_verbosity_reward/std": 0.24637892842292786, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 727.6953125, "epoch": 0.2857142857142857, "grad_norm": 0.015975242480635643, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31522440.0, "rewards/KL_reward/mean": -0.00571840675547719, "rewards/KL_reward/std": 0.006018051877617836, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.026372039690613747, "rewards/angle_reward/std": 0.6953475475311279, "rewards/thinking_verbosity_reward/mean": -1.3279554843902588, "rewards/thinking_verbosity_reward/std": 0.23171544075012207, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 671.3671875, "epoch": 0.2868480725623583, "grad_norm": 0.017872359603643417, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 31639087.0, "rewards/KL_reward/mean": -0.006082674488425255, "rewards/KL_reward/std": 0.0052271937020123005, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.054664481431245804, "rewards/angle_reward/std": 0.7107526063919067, "rewards/thinking_verbosity_reward/mean": -1.2752413749694824, "rewards/thinking_verbosity_reward/std": 0.22371646761894226, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 580.8046875, "epoch": 0.28798185941043086, "grad_norm": 0.02044520527124405, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31745310.0, "rewards/KL_reward/mean": -0.006792373023927212, "rewards/KL_reward/std": 0.006417332217097282, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": -0.07211057841777802, "rewards/angle_reward/std": 0.7001871466636658, "rewards/thinking_verbosity_reward/mean": -1.1845359802246094, "rewards/thinking_verbosity_reward/std": 0.21612995862960815, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 636.625, "epoch": 0.2891156462585034, "grad_norm": 0.018003787845373154, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31858366.0, "rewards/KL_reward/mean": -0.007784110493957996, "rewards/KL_reward/std": 0.007525305729359388, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.045843563973903656, "rewards/angle_reward/std": 0.7304276823997498, "rewards/thinking_verbosity_reward/mean": -1.2383641004562378, "rewards/thinking_verbosity_reward/std": 0.2365047037601471, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 630.3046875, "epoch": 0.29024943310657597, "grad_norm": 0.01779160089790821, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 31971045.0, "rewards/KL_reward/mean": -0.006721969693899155, "rewards/KL_reward/std": 0.00615214416757226, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.04389385133981705, "rewards/angle_reward/std": 0.6879055500030518, "rewards/thinking_verbosity_reward/mean": -1.2370407581329346, "rewards/thinking_verbosity_reward/std": 0.20806531608104706, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 663.0859375, "epoch": 0.29138321995464855, "grad_norm": 0.016839314252138138, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 32087152.0, "rewards/KL_reward/mean": -0.007783721201121807, "rewards/KL_reward/std": 0.007356081623584032, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.020175354555249214, "rewards/angle_reward/std": 0.7190738916397095, "rewards/thinking_verbosity_reward/mean": -1.258847951889038, "rewards/thinking_verbosity_reward/std": 0.26659345626831055, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 634.453125, "epoch": 0.2925170068027211, "grad_norm": 0.018239034339785576, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 32200514.0, "rewards/KL_reward/mean": -0.007003012113273144, "rewards/KL_reward/std": 0.006014623213559389, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.010274749249219894, "rewards/angle_reward/std": 0.6887149214744568, "rewards/thinking_verbosity_reward/mean": -1.2421730756759644, "rewards/thinking_verbosity_reward/std": 0.20233437418937683, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 707.8125, "epoch": 0.29365079365079366, "grad_norm": 0.014524643309414387, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 32322642.0, "rewards/KL_reward/mean": -0.005313513800501823, "rewards/KL_reward/std": 0.005417239386588335, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.004655953496694565, "rewards/angle_reward/std": 0.7026400566101074, "rewards/thinking_verbosity_reward/mean": -1.2959966659545898, "rewards/thinking_verbosity_reward/std": 0.2969035506248474, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 715.4453125, "epoch": 0.2947845804988662, "grad_norm": 0.01586010307073593, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 32446371.0, "rewards/KL_reward/mean": -0.007117181550711393, "rewards/KL_reward/std": 0.005675325635820627, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.04042618349194527, "rewards/angle_reward/std": 0.6932578682899475, "rewards/thinking_verbosity_reward/mean": -1.318619728088379, "rewards/thinking_verbosity_reward/std": 0.21846811473369598, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 702.765625, "epoch": 0.29591836734693877, "grad_norm": 0.017490733414888382, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 32568389.0, "rewards/KL_reward/mean": -0.006071691866964102, "rewards/KL_reward/std": 0.0064462595619261265, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.06298032402992249, "rewards/angle_reward/std": 0.7073873281478882, "rewards/thinking_verbosity_reward/mean": -1.2703129053115845, "rewards/thinking_verbosity_reward/std": 0.3767041265964508, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 667.46875, "epoch": 0.29705215419501135, "grad_norm": 0.01801498606801033, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 32685761.0, "rewards/KL_reward/mean": -0.007192197255790234, "rewards/KL_reward/std": 0.008388747461140156, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.060268156230449677, "rewards/angle_reward/std": 0.7170228362083435, "rewards/thinking_verbosity_reward/mean": -1.2398436069488525, "rewards/thinking_verbosity_reward/std": 0.36049333214759827, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 763.4453125, "epoch": 0.2981859410430839, "grad_norm": 0.01567150466144085, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 32815362.0, "rewards/KL_reward/mean": -0.0055938586592674255, "rewards/KL_reward/std": 0.006172350607812405, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.07238460332155228, "rewards/angle_reward/std": 0.7077392339706421, "rewards/thinking_verbosity_reward/mean": -1.3383076190948486, "rewards/thinking_verbosity_reward/std": 0.3404884338378906, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 580.0390625, "epoch": 0.29931972789115646, "grad_norm": 0.02081012912094593, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 32921879.0, "rewards/KL_reward/mean": -0.0069402060471475124, "rewards/KL_reward/std": 0.007032873574644327, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.06987656652927399, "rewards/angle_reward/std": 0.7268014550209045, "rewards/thinking_verbosity_reward/mean": -1.1844277381896973, "rewards/thinking_verbosity_reward/std": 0.2123224288225174, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 759.3125, "epoch": 0.30045351473922904, "grad_norm": 0.014871988445520401, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 33051487.0, "rewards/KL_reward/mean": -0.005905265919864178, "rewards/KL_reward/std": 0.005857815034687519, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": 0.04666680470108986, "rewards/angle_reward/std": 0.6774764060974121, "rewards/thinking_verbosity_reward/mean": -1.347033977508545, "rewards/thinking_verbosity_reward/std": 0.2861498296260834, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 666.671875, "epoch": 0.30158730158730157, "grad_norm": 0.019435271620750427, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 33168613.0, "rewards/KL_reward/mean": -0.007942674681544304, "rewards/KL_reward/std": 0.006478943396359682, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": 0.03964770957827568, "rewards/angle_reward/std": 0.7271708250045776, "rewards/thinking_verbosity_reward/mean": -1.2621772289276123, "rewards/thinking_verbosity_reward/std": 0.2675977051258087, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 708.4765625, "epoch": 0.30272108843537415, "grad_norm": 0.0172012560069561, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 33291626.0, "rewards/KL_reward/mean": -0.006565619260072708, "rewards/KL_reward/std": 0.00826906319707632, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": -0.0016555637121200562, "rewards/angle_reward/std": 0.7259076237678528, "rewards/thinking_verbosity_reward/mean": -1.3093411922454834, "rewards/thinking_verbosity_reward/std": 0.23389175534248352, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 673.6796875, "epoch": 0.30385487528344673, "grad_norm": 0.017249425873160362, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 33409025.0, "rewards/KL_reward/mean": -0.005863506346940994, "rewards/KL_reward/std": 0.006127702537924051, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.00578656792640686, "rewards/angle_reward/std": 0.7214607000350952, "rewards/thinking_verbosity_reward/mean": -1.269033432006836, "rewards/thinking_verbosity_reward/std": 0.2679460644721985, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 633.171875, "epoch": 0.30498866213151926, "grad_norm": 0.01901894062757492, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 33522199.0, "rewards/KL_reward/mean": -0.007426909636706114, "rewards/KL_reward/std": 0.006230643484741449, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.046921081840991974, "rewards/angle_reward/std": 0.685775876045227, "rewards/thinking_verbosity_reward/mean": -1.2192673683166504, "rewards/thinking_verbosity_reward/std": 0.3074667751789093, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 689.234375, "epoch": 0.30612244897959184, "grad_norm": 0.01681593805551529, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 33642533.0, "rewards/KL_reward/mean": -0.0071766008622944355, "rewards/KL_reward/std": 0.006169370375573635, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.05446804687380791, "rewards/angle_reward/std": 0.6952854990959167, "rewards/thinking_verbosity_reward/mean": -1.2952487468719482, "rewards/thinking_verbosity_reward/std": 0.207930326461792, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 661.78125, "epoch": 0.3072562358276644, "grad_norm": 0.01800641044974327, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 33759337.0, "rewards/KL_reward/mean": -0.006021884270012379, "rewards/KL_reward/std": 0.0057541439309716225, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.02188066579401493, "rewards/angle_reward/std": 0.6984922885894775, "rewards/thinking_verbosity_reward/mean": -1.2585150003433228, "rewards/thinking_verbosity_reward/std": 0.2620096206665039, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 637.9453125, "epoch": 0.30839002267573695, "grad_norm": 0.016137458384037018, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 33873570.0, "rewards/KL_reward/mean": -0.0077069224789738655, "rewards/KL_reward/std": 0.00656564487144351, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.028969956561923027, "rewards/angle_reward/std": 0.70253586769104, "rewards/thinking_verbosity_reward/mean": -1.243137240409851, "rewards/thinking_verbosity_reward/std": 0.21758389472961426, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 707.7265625, "epoch": 0.30952380952380953, "grad_norm": 0.01791304349899292, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 33996679.0, "rewards/KL_reward/mean": -0.005264196544885635, "rewards/KL_reward/std": 0.005644720047712326, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.0020773429423570633, "rewards/angle_reward/std": 0.7002083659172058, "rewards/thinking_verbosity_reward/mean": -1.2859245538711548, "rewards/thinking_verbosity_reward/std": 0.3378249704837799, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 634.515625, "epoch": 0.31065759637188206, "grad_norm": 0.018295975401997566, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 34109641.0, "rewards/KL_reward/mean": -0.00699696596711874, "rewards/KL_reward/std": 0.006174146663397551, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.027715960517525673, "rewards/angle_reward/std": 0.734915018081665, "rewards/thinking_verbosity_reward/mean": -1.2406665086746216, "rewards/thinking_verbosity_reward/std": 0.21177084743976593, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 701.734375, "epoch": 0.31179138321995464, "grad_norm": 0.01832558587193489, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 34231791.0, "rewards/KL_reward/mean": -0.0055017475970089436, "rewards/KL_reward/std": 0.006083634216338396, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": 0.054435499012470245, "rewards/angle_reward/std": 0.6996498107910156, "rewards/thinking_verbosity_reward/mean": -1.2995705604553223, "rewards/thinking_verbosity_reward/std": 0.2517715096473694, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 568.734375, "epoch": 0.3129251700680272, "grad_norm": 0.02160707302391529, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 34336997.0, "rewards/KL_reward/mean": -0.006332578137516975, "rewards/KL_reward/std": 0.008337794803082943, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.012452768161892891, "rewards/angle_reward/std": 0.6941329836845398, "rewards/thinking_verbosity_reward/mean": -1.1567527055740356, "rewards/thinking_verbosity_reward/std": 0.2861800491809845, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 620.0703125, "epoch": 0.31405895691609975, "grad_norm": 0.020352918654680252, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 34448878.0, "rewards/KL_reward/mean": -0.007475889287889004, "rewards/KL_reward/std": 0.007057628594338894, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": -0.027694180607795715, "rewards/angle_reward/std": 0.7493412494659424, "rewards/thinking_verbosity_reward/mean": -1.2272380590438843, "rewards/thinking_verbosity_reward/std": 0.20462197065353394, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 690.4375, "epoch": 0.31519274376417233, "grad_norm": 0.01698232814669609, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 34569086.0, "rewards/KL_reward/mean": -0.006635461002588272, "rewards/KL_reward/std": 0.005810300353914499, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": 0.04517771303653717, "rewards/angle_reward/std": 0.7285328507423401, "rewards/thinking_verbosity_reward/mean": -1.288184642791748, "rewards/thinking_verbosity_reward/std": 0.25428318977355957, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 713.0390625, "epoch": 0.3163265306122449, "grad_norm": 0.017586758360266685, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 34692219.0, "rewards/KL_reward/mean": -0.006374924443662167, "rewards/KL_reward/std": 0.005685480311512947, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.09032676368951797, "rewards/angle_reward/std": 0.68100905418396, "rewards/thinking_verbosity_reward/mean": -1.3132407665252686, "rewards/thinking_verbosity_reward/std": 0.23650206625461578, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 682.6015625, "epoch": 0.31746031746031744, "grad_norm": 0.01601078175008297, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 34811512.0, "rewards/KL_reward/mean": -0.0063264500349760056, "rewards/KL_reward/std": 0.006804726552218199, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.014285329729318619, "rewards/angle_reward/std": 0.7129133343696594, "rewards/thinking_verbosity_reward/mean": -1.2733216285705566, "rewards/thinking_verbosity_reward/std": 0.2886287271976471, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 646.5859375, "epoch": 0.31859410430839, "grad_norm": 0.018729139119386673, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 34925931.0, "rewards/KL_reward/mean": -0.0060574933886528015, "rewards/KL_reward/std": 0.006262896582484245, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/angle_reward/mean": -0.03853822499513626, "rewards/angle_reward/std": 0.672224223613739, "rewards/thinking_verbosity_reward/mean": -1.242337703704834, "rewards/thinking_verbosity_reward/std": 0.26664164662361145, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 756.6640625, "epoch": 0.3197278911564626, "grad_norm": 0.01873278245329857, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 35054304.0, "rewards/KL_reward/mean": -0.007486949674785137, "rewards/KL_reward/std": 0.0073571400716900826, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.09576274454593658, "rewards/angle_reward/std": 0.6821129322052002, "rewards/thinking_verbosity_reward/mean": -1.3155295848846436, "rewards/thinking_verbosity_reward/std": 0.39976224303245544, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 622.8515625, "epoch": 0.32086167800453513, "grad_norm": 0.018422694876790047, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 35166341.0, "rewards/KL_reward/mean": -0.008260250091552734, "rewards/KL_reward/std": 0.00703958235681057, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.0013125715777277946, "rewards/angle_reward/std": 0.7073757648468018, "rewards/thinking_verbosity_reward/mean": -1.2233890295028687, "rewards/thinking_verbosity_reward/std": 0.24154826998710632, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 672.703125, "epoch": 0.3219954648526077, "grad_norm": 0.018485447391867638, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 35284727.0, "rewards/KL_reward/mean": -0.00819874182343483, "rewards/KL_reward/std": 0.007637142203748226, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.027087576687335968, "rewards/angle_reward/std": 0.706924319267273, "rewards/thinking_verbosity_reward/mean": -1.2631499767303467, "rewards/thinking_verbosity_reward/std": 0.2903617322444916, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 656.4921875, "epoch": 0.3231292517006803, "grad_norm": 0.01780383102595806, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 35401038.0, "rewards/KL_reward/mean": -0.006157747469842434, "rewards/KL_reward/std": 0.006943391170352697, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.057412780821323395, "rewards/angle_reward/std": 0.7159155011177063, "rewards/thinking_verbosity_reward/mean": -1.257150411605835, "rewards/thinking_verbosity_reward/std": 0.24236872792243958, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 666.65625, "epoch": 0.3242630385487528, "grad_norm": 0.01581946201622486, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 35518610.0, "rewards/KL_reward/mean": -0.005965085234493017, "rewards/KL_reward/std": 0.0054606543853878975, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": 0.0004185568541288376, "rewards/angle_reward/std": 0.6976437568664551, "rewards/thinking_verbosity_reward/mean": -1.2735779285430908, "rewards/thinking_verbosity_reward/std": 0.20594365894794464, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 616.828125, "epoch": 0.3253968253968254, "grad_norm": 0.018370624631643295, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 35629364.0, "rewards/KL_reward/mean": -0.009418688714504242, "rewards/KL_reward/std": 0.00715947849676013, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.003683575429022312, "rewards/angle_reward/std": 0.6758346557617188, "rewards/thinking_verbosity_reward/mean": -1.221147894859314, "rewards/thinking_verbosity_reward/std": 0.2207469344139099, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 645.171875, "epoch": 0.32653061224489793, "grad_norm": 0.02141338214278221, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 35743922.0, "rewards/KL_reward/mean": -0.007814271375536919, "rewards/KL_reward/std": 0.0062730573117733, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.03710462898015976, "rewards/angle_reward/std": 0.7213236093521118, "rewards/thinking_verbosity_reward/mean": -1.2461848258972168, "rewards/thinking_verbosity_reward/std": 0.24051284790039062, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 610.546875, "epoch": 0.3276643990929705, "grad_norm": 0.019021552056074142, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 35853616.0, "rewards/KL_reward/mean": -0.008106883615255356, "rewards/KL_reward/std": 0.00803397037088871, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4513758420944214, "rewards/angle_reward/mean": -0.014590539038181305, "rewards/angle_reward/std": 0.7261592149734497, "rewards/thinking_verbosity_reward/mean": -1.2160003185272217, "rewards/thinking_verbosity_reward/std": 0.213370218873024, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 633.015625, "epoch": 0.3287981859410431, "grad_norm": 0.021869687363505363, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 35966794.0, "rewards/KL_reward/mean": -0.007401672657579184, "rewards/KL_reward/std": 0.006323895882815123, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.03046243079006672, "rewards/angle_reward/std": 0.6955752968788147, "rewards/thinking_verbosity_reward/mean": -1.2145500183105469, "rewards/thinking_verbosity_reward/std": 0.32519689202308655, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 667.59375, "epoch": 0.3299319727891156, "grad_norm": 0.01596551202237606, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 36084022.0, "rewards/KL_reward/mean": -0.007860099896788597, "rewards/KL_reward/std": 0.00665412750095129, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.029873479157686234, "rewards/angle_reward/std": 0.707763671875, "rewards/thinking_verbosity_reward/mean": -1.268153429031372, "rewards/thinking_verbosity_reward/std": 0.24226154386997223, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 641.1484375, "epoch": 0.3310657596371882, "grad_norm": 0.019610509276390076, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 36198417.0, "rewards/KL_reward/mean": -0.0062166606076061726, "rewards/KL_reward/std": 0.006279591470956802, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.06403964012861252, "rewards/angle_reward/std": 0.7216745615005493, "rewards/thinking_verbosity_reward/mean": -1.248220682144165, "rewards/thinking_verbosity_reward/std": 0.20642216503620148, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 673.6484375, "epoch": 0.3321995464852608, "grad_norm": 0.016013026237487793, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 36316948.0, "rewards/KL_reward/mean": -0.008568549528717995, "rewards/KL_reward/std": 0.007496171165257692, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.035736314952373505, "rewards/angle_reward/std": 0.7004334926605225, "rewards/thinking_verbosity_reward/mean": -1.2675621509552002, "rewards/thinking_verbosity_reward/std": 0.2747315466403961, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 657.9921875, "epoch": 0.3333333333333333, "grad_norm": 0.020345015451312065, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 36432819.0, "rewards/KL_reward/mean": -0.008337104693055153, "rewards/KL_reward/std": 0.007409153506159782, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.09618128091096878, "rewards/angle_reward/std": 0.7162677049636841, "rewards/thinking_verbosity_reward/mean": -1.2603285312652588, "rewards/thinking_verbosity_reward/std": 0.23327921330928802, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 665.0859375, "epoch": 0.3344671201814059, "grad_norm": 0.019598834216594696, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 36549526.0, "rewards/KL_reward/mean": -0.005868074018508196, "rewards/KL_reward/std": 0.005221591331064701, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.46208351850509644, "rewards/angle_reward/mean": -0.03156152367591858, "rewards/angle_reward/std": 0.7383613586425781, "rewards/thinking_verbosity_reward/mean": -1.2608528137207031, "rewards/thinking_verbosity_reward/std": 0.266458660364151, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 680.3125, "epoch": 0.3356009070294785, "grad_norm": 0.01849890686571598, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 36667966.0, "rewards/KL_reward/mean": -0.00805110577493906, "rewards/KL_reward/std": 0.007103241514414549, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": 0.042491309344768524, "rewards/angle_reward/std": 0.699578046798706, "rewards/thinking_verbosity_reward/mean": -1.2727693319320679, "rewards/thinking_verbosity_reward/std": 0.280956506729126, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 635.7734375, "epoch": 0.336734693877551, "grad_norm": 0.020537061616778374, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 36781049.0, "rewards/KL_reward/mean": -0.007871702313423157, "rewards/KL_reward/std": 0.007687455043196678, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.09363602101802826, "rewards/angle_reward/std": 0.7016919255256653, "rewards/thinking_verbosity_reward/mean": -1.2297581434249878, "rewards/thinking_verbosity_reward/std": 0.274255633354187, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 693.84375, "epoch": 0.3378684807256236, "grad_norm": 0.015424701385200024, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 36902181.0, "rewards/KL_reward/mean": -0.0069398339837789536, "rewards/KL_reward/std": 0.005675792694091797, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.010185010731220245, "rewards/angle_reward/std": 0.7017130851745605, "rewards/thinking_verbosity_reward/mean": -1.2912731170654297, "rewards/thinking_verbosity_reward/std": 0.25536859035491943, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 662.3046875, "epoch": 0.33900226757369617, "grad_norm": 0.01764794811606407, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37018204.0, "rewards/KL_reward/mean": -0.007857094518840313, "rewards/KL_reward/std": 0.007627793122082949, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.017138749361038208, "rewards/angle_reward/std": 0.7161518335342407, "rewards/thinking_verbosity_reward/mean": -1.2537500858306885, "rewards/thinking_verbosity_reward/std": 0.28648269176483154, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 784.5390625, "epoch": 0.3401360544217687, "grad_norm": 0.01640402339398861, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37150393.0, "rewards/KL_reward/mean": -0.006395349279046059, "rewards/KL_reward/std": 0.006379930768162012, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.049736715853214264, "rewards/angle_reward/std": 0.7273575067520142, "rewards/thinking_verbosity_reward/mean": -1.3761987686157227, "rewards/thinking_verbosity_reward/std": 0.25580403208732605, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 598.3515625, "epoch": 0.3412698412698413, "grad_norm": 0.018060341477394104, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 37257910.0, "rewards/KL_reward/mean": -0.007060700561851263, "rewards/KL_reward/std": 0.006098188925534487, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.007485490292310715, "rewards/angle_reward/std": 0.6995981931686401, "rewards/thinking_verbosity_reward/mean": -1.2047525644302368, "rewards/thinking_verbosity_reward/std": 0.2056220918893814, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 517.0390625, "epoch": 0.3424036281179138, "grad_norm": 0.022523924708366394, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37355931.0, "rewards/KL_reward/mean": -0.009995004162192345, "rewards/KL_reward/std": 0.00978040136396885, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.4513758420944214, "rewards/angle_reward/mean": 0.062474675476551056, "rewards/angle_reward/std": 0.716263473033905, "rewards/thinking_verbosity_reward/mean": -1.1115931272506714, "rewards/thinking_verbosity_reward/std": 0.23419605195522308, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 586.640625, "epoch": 0.3435374149659864, "grad_norm": 0.02103191800415516, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 37463093.0, "rewards/KL_reward/mean": -0.00888234656304121, "rewards/KL_reward/std": 0.008172755129635334, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.050526078790426254, "rewards/angle_reward/std": 0.7193748950958252, "rewards/thinking_verbosity_reward/mean": -1.1831541061401367, "rewards/thinking_verbosity_reward/std": 0.254467636346817, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 565.203125, "epoch": 0.34467120181405897, "grad_norm": 0.019218802452087402, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37566719.0, "rewards/KL_reward/mean": -0.009941613301634789, "rewards/KL_reward/std": 0.007637757807970047, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.022061128169298172, "rewards/angle_reward/std": 0.708019495010376, "rewards/thinking_verbosity_reward/mean": -1.1639440059661865, "rewards/thinking_verbosity_reward/std": 0.23694221675395966, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 669.921875, "epoch": 0.3458049886621315, "grad_norm": 0.019459662958979607, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37684597.0, "rewards/KL_reward/mean": -0.009102117270231247, "rewards/KL_reward/std": 0.008494559675455093, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.024678537622094154, "rewards/angle_reward/std": 0.7370985746383667, "rewards/thinking_verbosity_reward/mean": -1.2622956037521362, "rewards/thinking_verbosity_reward/std": 0.28195133805274963, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 692.65625, "epoch": 0.3469387755102041, "grad_norm": 0.019493240863084793, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37805657.0, "rewards/KL_reward/mean": -0.008165314793586731, "rewards/KL_reward/std": 0.007228944916278124, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": 0.062268540263175964, "rewards/angle_reward/std": 0.7373337149620056, "rewards/thinking_verbosity_reward/mean": -1.2855594158172607, "rewards/thinking_verbosity_reward/std": 0.2775971591472626, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 663.1640625, "epoch": 0.34807256235827666, "grad_norm": 0.016767730936408043, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 37922774.0, "rewards/KL_reward/mean": -0.008340677246451378, "rewards/KL_reward/std": 0.007052331697195768, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": 0.062498558312654495, "rewards/angle_reward/std": 0.6978163123130798, "rewards/thinking_verbosity_reward/mean": -1.2607831954956055, "rewards/thinking_verbosity_reward/std": 0.2573254406452179, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 635.0546875, "epoch": 0.3492063492063492, "grad_norm": 0.016742372885346413, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 38035365.0, "rewards/KL_reward/mean": -0.010181295685470104, "rewards/KL_reward/std": 0.007941014133393764, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": 0.042019449174404144, "rewards/angle_reward/std": 0.6886358261108398, "rewards/thinking_verbosity_reward/mean": -1.2268184423446655, "rewards/thinking_verbosity_reward/std": 0.2841179370880127, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 658.5078125, "epoch": 0.35034013605442177, "grad_norm": 0.016526585444808006, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 38151198.0, "rewards/KL_reward/mean": -0.0074892486445605755, "rewards/KL_reward/std": 0.0055892588570714, "rewards/accuracy_reward/mean": 0.296875, "rewards/accuracy_reward/std": 0.45867621898651123, "rewards/angle_reward/mean": -0.026805002242326736, "rewards/angle_reward/std": 0.700246274471283, "rewards/thinking_verbosity_reward/mean": -1.2586907148361206, "rewards/thinking_verbosity_reward/std": 0.24481716752052307, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 683.9453125, "epoch": 0.35147392290249435, "grad_norm": 0.0192258283495903, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 38270863.0, "rewards/KL_reward/mean": -0.009363025426864624, "rewards/KL_reward/std": 0.007326322607696056, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.06473156809806824, "rewards/angle_reward/std": 0.7032992243766785, "rewards/thinking_verbosity_reward/mean": -1.2719552516937256, "rewards/thinking_verbosity_reward/std": 0.3002309501171112, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 619.0703125, "epoch": 0.3526077097505669, "grad_norm": 0.016836460679769516, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 38381744.0, "rewards/KL_reward/mean": -0.010077145881950855, "rewards/KL_reward/std": 0.007469221018254757, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.040938571095466614, "rewards/angle_reward/std": 0.7000210285186768, "rewards/thinking_verbosity_reward/mean": -1.2291001081466675, "rewards/thinking_verbosity_reward/std": 0.1862824708223343, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 642.453125, "epoch": 0.35374149659863946, "grad_norm": 0.017571302130818367, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 38496154.0, "rewards/KL_reward/mean": -0.008081937208771706, "rewards/KL_reward/std": 0.006963278166949749, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.05039401724934578, "rewards/angle_reward/std": 0.7186501622200012, "rewards/thinking_verbosity_reward/mean": -1.2407217025756836, "rewards/thinking_verbosity_reward/std": 0.254418283700943, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 714.4375, "epoch": 0.35487528344671204, "grad_norm": 0.017241306602954865, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 38619218.0, "rewards/KL_reward/mean": -0.008873986080288887, "rewards/KL_reward/std": 0.008177191950380802, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.07794732600450516, "rewards/angle_reward/std": 0.720664918422699, "rewards/thinking_verbosity_reward/mean": -1.3019829988479614, "rewards/thinking_verbosity_reward/std": 0.29851409792900085, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 658.84375, "epoch": 0.35600907029478457, "grad_norm": 0.018195878714323044, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 38735758.0, "rewards/KL_reward/mean": -0.008417133241891861, "rewards/KL_reward/std": 0.006822847295552492, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.03660423681139946, "rewards/angle_reward/std": 0.7281025648117065, "rewards/thinking_verbosity_reward/mean": -1.2603774070739746, "rewards/thinking_verbosity_reward/std": 0.2376958131790161, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 695.375, "epoch": 0.35714285714285715, "grad_norm": 0.01648283377289772, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 38856750.0, "rewards/KL_reward/mean": -0.008553637191653252, "rewards/KL_reward/std": 0.007821978069841862, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.021729689091444016, "rewards/angle_reward/std": 0.6876324415206909, "rewards/thinking_verbosity_reward/mean": -1.2965296506881714, "rewards/thinking_verbosity_reward/std": 0.2352479249238968, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 566.4609375, "epoch": 0.35827664399092973, "grad_norm": 0.02313867025077343, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 38961017.0, "rewards/KL_reward/mean": -0.012008454650640488, "rewards/KL_reward/std": 0.008895636536180973, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.07547923922538757, "rewards/angle_reward/std": 0.7204188704490662, "rewards/thinking_verbosity_reward/mean": -1.1683481931686401, "rewards/thinking_verbosity_reward/std": 0.22139836847782135, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 652.84375, "epoch": 0.35941043083900226, "grad_norm": 0.014823821373283863, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 39075997.0, "rewards/KL_reward/mean": -0.008365944027900696, "rewards/KL_reward/std": 0.00788775086402893, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.023123951628804207, "rewards/angle_reward/std": 0.7013049125671387, "rewards/thinking_verbosity_reward/mean": -1.2614336013793945, "rewards/thinking_verbosity_reward/std": 0.19676585495471954, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 653.203125, "epoch": 0.36054421768707484, "grad_norm": 0.017812160775065422, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 39191943.0, "rewards/KL_reward/mean": -0.00916835106909275, "rewards/KL_reward/std": 0.008945772424340248, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.0597342923283577, "rewards/angle_reward/std": 0.7158983945846558, "rewards/thinking_verbosity_reward/mean": -1.2495640516281128, "rewards/thinking_verbosity_reward/std": 0.2638596296310425, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 629.8359375, "epoch": 0.36167800453514737, "grad_norm": 0.016286782920360565, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 39304482.0, "rewards/KL_reward/mean": -0.010274011641740799, "rewards/KL_reward/std": 0.008000952191650867, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/angle_reward/mean": -0.008843816816806793, "rewards/angle_reward/std": 0.7008348107337952, "rewards/thinking_verbosity_reward/mean": -1.2296520471572876, "rewards/thinking_verbosity_reward/std": 0.2459651231765747, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 666.015625, "epoch": 0.36281179138321995, "grad_norm": 0.017830902710556984, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 39422132.0, "rewards/KL_reward/mean": -0.009082363918423653, "rewards/KL_reward/std": 0.008023828268051147, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.07964855432510376, "rewards/angle_reward/std": 0.7010746598243713, "rewards/thinking_verbosity_reward/mean": -1.2581864595413208, "rewards/thinking_verbosity_reward/std": 0.28311049938201904, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 690.1640625, "epoch": 0.36394557823129253, "grad_norm": 0.016204338520765305, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 39541873.0, "rewards/KL_reward/mean": -0.00774435605853796, "rewards/KL_reward/std": 0.007772011682391167, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.020165707916021347, "rewards/angle_reward/std": 0.7051674127578735, "rewards/thinking_verbosity_reward/mean": -1.2649781703948975, "rewards/thinking_verbosity_reward/std": 0.35163623094558716, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 714.8828125, "epoch": 0.36507936507936506, "grad_norm": 0.01743881031870842, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 39664826.0, "rewards/KL_reward/mean": -0.008616095408797264, "rewards/KL_reward/std": 0.007796288467943668, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.0778302252292633, "rewards/angle_reward/std": 0.711172878742218, "rewards/thinking_verbosity_reward/mean": -1.306434988975525, "rewards/thinking_verbosity_reward/std": 0.2801944315433502, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 636.0078125, "epoch": 0.36621315192743764, "grad_norm": 0.021684397011995316, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 39778667.0, "rewards/KL_reward/mean": -0.011362764984369278, "rewards/KL_reward/std": 0.00869831908494234, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.029594052582979202, "rewards/angle_reward/std": 0.7153961658477783, "rewards/thinking_verbosity_reward/mean": -1.225907325744629, "rewards/thinking_verbosity_reward/std": 0.2921258807182312, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 598.90625, "epoch": 0.3673469387755102, "grad_norm": 0.01639612577855587, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 39887495.0, "rewards/KL_reward/mean": -0.00996290985494852, "rewards/KL_reward/std": 0.008893659338355064, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.04011102393269539, "rewards/angle_reward/std": 0.6990154385566711, "rewards/thinking_verbosity_reward/mean": -1.1960594654083252, "rewards/thinking_verbosity_reward/std": 0.2544650733470917, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 594.0078125, "epoch": 0.36848072562358275, "grad_norm": 0.020544040948152542, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 39995504.0, "rewards/KL_reward/mean": -0.009360745549201965, "rewards/KL_reward/std": 0.0074703386053442955, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.09173193573951721, "rewards/angle_reward/std": 0.7248408794403076, "rewards/thinking_verbosity_reward/mean": -1.195133924484253, "rewards/thinking_verbosity_reward/std": 0.23354220390319824, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 615.875, "epoch": 0.36961451247165533, "grad_norm": 0.019898978993296623, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 40106392.0, "rewards/KL_reward/mean": -0.009691888466477394, "rewards/KL_reward/std": 0.007954951375722885, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.08430376648902893, "rewards/angle_reward/std": 0.7162787914276123, "rewards/thinking_verbosity_reward/mean": -1.2271376848220825, "rewards/thinking_verbosity_reward/std": 0.17739422619342804, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 569.078125, "epoch": 0.3707482993197279, "grad_norm": 0.019553160294890404, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 40210906.0, "rewards/KL_reward/mean": -0.011003728024661541, "rewards/KL_reward/std": 0.009004923515021801, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": -0.03519893437623978, "rewards/angle_reward/std": 0.71088045835495, "rewards/thinking_verbosity_reward/mean": -1.163379430770874, "rewards/thinking_verbosity_reward/std": 0.25940099358558655, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 581.140625, "epoch": 0.37188208616780044, "grad_norm": 0.019406193867325783, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 40317180.0, "rewards/KL_reward/mean": -0.009035211056470871, "rewards/KL_reward/std": 0.0066587477922439575, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.06995624303817749, "rewards/angle_reward/std": 0.7030081748962402, "rewards/thinking_verbosity_reward/mean": -1.184302568435669, "rewards/thinking_verbosity_reward/std": 0.21930840611457825, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 608.3671875, "epoch": 0.373015873015873, "grad_norm": 0.017711546272039413, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 40426515.0, "rewards/KL_reward/mean": -0.009203861467540264, "rewards/KL_reward/std": 0.007614146452397108, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.0009618657641112804, "rewards/angle_reward/std": 0.6823228001594543, "rewards/thinking_verbosity_reward/mean": -1.21611487865448, "rewards/thinking_verbosity_reward/std": 0.19933529198169708, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 628.4765625, "epoch": 0.3741496598639456, "grad_norm": 0.017541462555527687, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 40538904.0, "rewards/KL_reward/mean": -0.009096905589103699, "rewards/KL_reward/std": 0.009053525514900684, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": 0.00803598016500473, "rewards/angle_reward/std": 0.7005848288536072, "rewards/thinking_verbosity_reward/mean": -1.2112784385681152, "rewards/thinking_verbosity_reward/std": 0.319836288690567, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 670.0078125, "epoch": 0.37528344671201813, "grad_norm": 0.02290179766714573, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 40656385.0, "rewards/KL_reward/mean": -0.009822444058954716, "rewards/KL_reward/std": 0.0084287254139781, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.07108981907367706, "rewards/angle_reward/std": 0.72761070728302, "rewards/thinking_verbosity_reward/mean": -1.272742509841919, "rewards/thinking_verbosity_reward/std": 0.2303491234779358, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 605.453125, "epoch": 0.3764172335600907, "grad_norm": 0.018231753259897232, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 40765707.0, "rewards/KL_reward/mean": -0.010320719331502914, "rewards/KL_reward/std": 0.007579497992992401, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": 0.0481044203042984, "rewards/angle_reward/std": 0.7102060317993164, "rewards/thinking_verbosity_reward/mean": -1.2064979076385498, "rewards/thinking_verbosity_reward/std": 0.2365427017211914, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 585.71875, "epoch": 0.37755102040816324, "grad_norm": 0.01733916811645031, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 40871983.0, "rewards/KL_reward/mean": -0.009395333006978035, "rewards/KL_reward/std": 0.00850164145231247, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": -0.09402258694171906, "rewards/angle_reward/std": 0.6644055247306824, "rewards/thinking_verbosity_reward/mean": -1.1958613395690918, "rewards/thinking_verbosity_reward/std": 0.17861491441726685, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 644.5390625, "epoch": 0.3786848072562358, "grad_norm": 0.015585731714963913, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 40986700.0, "rewards/KL_reward/mean": -0.011141793802380562, "rewards/KL_reward/std": 0.007814460434019566, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.050050366669893265, "rewards/angle_reward/std": 0.6957428455352783, "rewards/thinking_verbosity_reward/mean": -1.2409121990203857, "rewards/thinking_verbosity_reward/std": 0.263643354177475, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 552.734375, "epoch": 0.3798185941043084, "grad_norm": 0.01960836723446846, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 41089434.0, "rewards/KL_reward/mean": -0.014167695306241512, "rewards/KL_reward/std": 0.010362173430621624, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": 0.005596708040684462, "rewards/angle_reward/std": 0.7121492624282837, "rewards/thinking_verbosity_reward/mean": -1.151566743850708, "rewards/thinking_verbosity_reward/std": 0.23162296414375305, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 580.59375, "epoch": 0.38095238095238093, "grad_norm": 0.018973182886838913, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 41194886.0, "rewards/KL_reward/mean": -0.011608713306486607, "rewards/KL_reward/std": 0.01016836054623127, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.09077536314725876, "rewards/angle_reward/std": 0.6637977957725525, "rewards/thinking_verbosity_reward/mean": -1.1720116138458252, "rewards/thinking_verbosity_reward/std": 0.27554944157600403, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 548.703125, "epoch": 0.3820861678004535, "grad_norm": 0.020832985639572144, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 41297000.0, "rewards/KL_reward/mean": -0.01433346327394247, "rewards/KL_reward/std": 0.010334284976124763, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.026250137016177177, "rewards/angle_reward/std": 0.7008341550827026, "rewards/thinking_verbosity_reward/mean": -1.149053692817688, "rewards/thinking_verbosity_reward/std": 0.22190140187740326, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 651.7421875, "epoch": 0.3832199546485261, "grad_norm": 0.016582710668444633, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 41412367.0, "rewards/KL_reward/mean": -0.011677080765366554, "rewards/KL_reward/std": 0.007517748977988958, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.07228608429431915, "rewards/angle_reward/std": 0.6654783487319946, "rewards/thinking_verbosity_reward/mean": -1.2558412551879883, "rewards/thinking_verbosity_reward/std": 0.2238079309463501, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 473.3828125, "epoch": 0.3843537414965986, "grad_norm": 0.027373218908905983, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 41505200.0, "rewards/KL_reward/mean": -0.014650771394371986, "rewards/KL_reward/std": 0.010913332924246788, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.05021342635154724, "rewards/angle_reward/std": 0.6841585040092468, "rewards/thinking_verbosity_reward/mean": -1.0703449249267578, "rewards/thinking_verbosity_reward/std": 0.18872351944446564, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 538.09375, "epoch": 0.3854875283446712, "grad_norm": 0.022517004981637, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 41606260.0, "rewards/KL_reward/mean": -0.013644316233694553, "rewards/KL_reward/std": 0.009786357171833515, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": 0.01600516028702259, "rewards/angle_reward/std": 0.6811925172805786, "rewards/thinking_verbosity_reward/mean": -1.1354541778564453, "rewards/thinking_verbosity_reward/std": 0.2319503128528595, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 647.5234375, "epoch": 0.3866213151927438, "grad_norm": 0.016684694215655327, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 41720591.0, "rewards/KL_reward/mean": -0.010870617814362049, "rewards/KL_reward/std": 0.009292100556194782, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.0134214386343956, "rewards/angle_reward/std": 0.6902464032173157, "rewards/thinking_verbosity_reward/mean": -1.2468688488006592, "rewards/thinking_verbosity_reward/std": 0.24922247231006622, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 565.109375, "epoch": 0.3877551020408163, "grad_norm": 0.01767166331410408, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 41825269.0, "rewards/KL_reward/mean": -0.011611821129918098, "rewards/KL_reward/std": 0.009323826991021633, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.0860278457403183, "rewards/angle_reward/std": 0.6994227170944214, "rewards/thinking_verbosity_reward/mean": -1.1734181642532349, "rewards/thinking_verbosity_reward/std": 0.18337365984916687, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 695.015625, "epoch": 0.3888888888888889, "grad_norm": 0.016172772273421288, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 41946431.0, "rewards/KL_reward/mean": -0.0106268972158432, "rewards/KL_reward/std": 0.00812254287302494, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.055921170860528946, "rewards/angle_reward/std": 0.7074174880981445, "rewards/thinking_verbosity_reward/mean": -1.2934153079986572, "rewards/thinking_verbosity_reward/std": 0.2502100169658661, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 492.84375, "epoch": 0.3900226757369615, "grad_norm": 0.022642863914370537, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 42041259.0, "rewards/KL_reward/mean": -0.013332553207874298, "rewards/KL_reward/std": 0.009947570040822029, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.026306793093681335, "rewards/angle_reward/std": 0.7171652317047119, "rewards/thinking_verbosity_reward/mean": -1.0878281593322754, "rewards/thinking_verbosity_reward/std": 0.21583305299282074, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 582.0078125, "epoch": 0.391156462585034, "grad_norm": 0.021043403074145317, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 42147996.0, "rewards/KL_reward/mean": -0.011846832931041718, "rewards/KL_reward/std": 0.009587462060153484, "rewards/accuracy_reward/mean": 0.3203125, "rewards/accuracy_reward/std": 0.4684300124645233, "rewards/angle_reward/mean": -0.05014696717262268, "rewards/angle_reward/std": 0.7022067904472351, "rewards/thinking_verbosity_reward/mean": -1.1845569610595703, "rewards/thinking_verbosity_reward/std": 0.22296461462974548, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 593.0703125, "epoch": 0.3922902494331066, "grad_norm": 0.018539853394031525, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 42256085.0, "rewards/KL_reward/mean": -0.01160618755966425, "rewards/KL_reward/std": 0.009447838179767132, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.026164177805185318, "rewards/angle_reward/std": 0.7124016284942627, "rewards/thinking_verbosity_reward/mean": -1.19658625125885, "rewards/thinking_verbosity_reward/std": 0.2206772416830063, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 544.828125, "epoch": 0.3934240362811791, "grad_norm": 0.02210315316915512, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 42358007.0, "rewards/KL_reward/mean": -0.013557846657931805, "rewards/KL_reward/std": 0.009934193454682827, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.03684143349528313, "rewards/angle_reward/std": 0.698218047618866, "rewards/thinking_verbosity_reward/mean": -1.1469924449920654, "rewards/thinking_verbosity_reward/std": 0.21053484082221985, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 622.9765625, "epoch": 0.3945578231292517, "grad_norm": 0.017409779131412506, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 42469940.0, "rewards/KL_reward/mean": -0.010706034488976002, "rewards/KL_reward/std": 0.009406552650034428, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.03276388719677925, "rewards/angle_reward/std": 0.6755772829055786, "rewards/thinking_verbosity_reward/mean": -1.2311208248138428, "rewards/thinking_verbosity_reward/std": 0.1989777535200119, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 641.8515625, "epoch": 0.3956916099773243, "grad_norm": 0.016742389649152756, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 42583889.0, "rewards/KL_reward/mean": -0.011015353724360466, "rewards/KL_reward/std": 0.008712667971849442, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.008286483585834503, "rewards/angle_reward/std": 0.7040897011756897, "rewards/thinking_verbosity_reward/mean": -1.2278902530670166, "rewards/thinking_verbosity_reward/std": 0.30847659707069397, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 568.109375, "epoch": 0.3968253968253968, "grad_norm": 0.021108614280819893, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 42688471.0, "rewards/KL_reward/mean": -0.013645791448652744, "rewards/KL_reward/std": 0.011249223724007607, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": 0.05859003961086273, "rewards/angle_reward/std": 0.7381419539451599, "rewards/thinking_verbosity_reward/mean": -1.1669996976852417, "rewards/thinking_verbosity_reward/std": 0.23741234838962555, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 553.9765625, "epoch": 0.3979591836734694, "grad_norm": 0.020242752507328987, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 42791812.0, "rewards/KL_reward/mean": -0.013703307136893272, "rewards/KL_reward/std": 0.010288462042808533, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.01580832153558731, "rewards/angle_reward/std": 0.7156597971916199, "rewards/thinking_verbosity_reward/mean": -1.15157949924469, "rewards/thinking_verbosity_reward/std": 0.23826247453689575, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 574.71875, "epoch": 0.39909297052154197, "grad_norm": 0.021016456186771393, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 42897768.0, "rewards/KL_reward/mean": -0.011165348812937737, "rewards/KL_reward/std": 0.0091776167973876, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/angle_reward/mean": -0.014624292030930519, "rewards/angle_reward/std": 0.6658822894096375, "rewards/thinking_verbosity_reward/mean": -1.1669721603393555, "rewards/thinking_verbosity_reward/std": 0.2702290713787079, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 604.578125, "epoch": 0.4002267573696145, "grad_norm": 0.01961846835911274, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43007306.0, "rewards/KL_reward/mean": -0.01148420199751854, "rewards/KL_reward/std": 0.010005916468799114, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.03877856582403183, "rewards/angle_reward/std": 0.6896043419837952, "rewards/thinking_verbosity_reward/mean": -1.1999263763427734, "rewards/thinking_verbosity_reward/std": 0.26386937499046326, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 608.1640625, "epoch": 0.4013605442176871, "grad_norm": 0.017984895035624504, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43117167.0, "rewards/KL_reward/mean": -0.012511475943028927, "rewards/KL_reward/std": 0.008827902376651764, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.01137047354131937, "rewards/angle_reward/std": 0.7264191508293152, "rewards/thinking_verbosity_reward/mean": -1.2140936851501465, "rewards/thinking_verbosity_reward/std": 0.21022173762321472, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 596.3125, "epoch": 0.40249433106575966, "grad_norm": 0.01846635527908802, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 43225463.0, "rewards/KL_reward/mean": -0.011512024328112602, "rewards/KL_reward/std": 0.009280053898692131, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.049537211656570435, "rewards/angle_reward/std": 0.6839423179626465, "rewards/thinking_verbosity_reward/mean": -1.201066017150879, "rewards/thinking_verbosity_reward/std": 0.2146666944026947, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 669.7578125, "epoch": 0.4036281179138322, "grad_norm": 0.022069066762924194, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43343104.0, "rewards/KL_reward/mean": -0.013993291184306145, "rewards/KL_reward/std": 0.012002113275229931, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.0638950914144516, "rewards/angle_reward/std": 0.7101727724075317, "rewards/thinking_verbosity_reward/mean": -1.2499220371246338, "rewards/thinking_verbosity_reward/std": 0.33237338066101074, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 585.03125, "epoch": 0.40476190476190477, "grad_norm": 0.020424002781510353, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43449764.0, "rewards/KL_reward/mean": -0.012592829763889313, "rewards/KL_reward/std": 0.009561818093061447, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": 0.042757753282785416, "rewards/angle_reward/std": 0.7115549445152283, "rewards/thinking_verbosity_reward/mean": -1.182867407798767, "rewards/thinking_verbosity_reward/std": 0.2477547973394394, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 574.359375, "epoch": 0.40589569160997735, "grad_norm": 0.0195750929415226, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43554914.0, "rewards/KL_reward/mean": -0.01165720634162426, "rewards/KL_reward/std": 0.008440484292805195, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.03374239802360535, "rewards/angle_reward/std": 0.6871976256370544, "rewards/thinking_verbosity_reward/mean": -1.1825599670410156, "rewards/thinking_verbosity_reward/std": 0.18763259053230286, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 540.2109375, "epoch": 0.4070294784580499, "grad_norm": 0.024201730266213417, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43655309.0, "rewards/KL_reward/mean": -0.013428442180156708, "rewards/KL_reward/std": 0.009504460729658604, "rewards/accuracy_reward/mean": 0.21875, "rewards/accuracy_reward/std": 0.41502299904823303, "rewards/angle_reward/mean": -0.03815825283527374, "rewards/angle_reward/std": 0.7391625642776489, "rewards/thinking_verbosity_reward/mean": -1.1418852806091309, "rewards/thinking_verbosity_reward/std": 0.21088513731956482, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 513.296875, "epoch": 0.40816326530612246, "grad_norm": 0.018849622458219528, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43753107.0, "rewards/KL_reward/mean": -0.010304594412446022, "rewards/KL_reward/std": 0.007997327484190464, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.05903025344014168, "rewards/angle_reward/std": 0.7214637398719788, "rewards/thinking_verbosity_reward/mean": -1.1183676719665527, "rewards/thinking_verbosity_reward/std": 0.17387330532073975, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 622.234375, "epoch": 0.409297052154195, "grad_norm": 0.014960212633013725, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 43864649.0, "rewards/KL_reward/mean": -0.011811970733106136, "rewards/KL_reward/std": 0.009086758829653263, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": 0.01255982369184494, "rewards/angle_reward/std": 0.6803118586540222, "rewards/thinking_verbosity_reward/mean": -1.2231049537658691, "rewards/thinking_verbosity_reward/std": 0.239854633808136, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 600.4765625, "epoch": 0.41043083900226757, "grad_norm": 0.019353918731212616, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 43973806.0, "rewards/KL_reward/mean": -0.012659039348363876, "rewards/KL_reward/std": 0.010857551358640194, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.044028282165527344, "rewards/angle_reward/std": 0.7046893239021301, "rewards/thinking_verbosity_reward/mean": -1.204061508178711, "rewards/thinking_verbosity_reward/std": 0.22202034294605255, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 556.4140625, "epoch": 0.41156462585034015, "grad_norm": 0.01892830803990364, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 44077067.0, "rewards/KL_reward/mean": -0.013601994141936302, "rewards/KL_reward/std": 0.01063512172549963, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44340085983276367, "rewards/angle_reward/mean": 0.010024391114711761, "rewards/angle_reward/std": 0.7016712427139282, "rewards/thinking_verbosity_reward/mean": -1.1596899032592773, "rewards/thinking_verbosity_reward/std": 0.2097102701663971, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 555.5546875, "epoch": 0.4126984126984127, "grad_norm": 0.02198626659810543, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44180226.0, "rewards/KL_reward/mean": -0.015893777832388878, "rewards/KL_reward/std": 0.01015418116003275, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": -0.002530481666326523, "rewards/angle_reward/std": 0.6922826766967773, "rewards/thinking_verbosity_reward/mean": -1.144598126411438, "rewards/thinking_verbosity_reward/std": 0.27734825015068054, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 567.671875, "epoch": 0.41383219954648526, "grad_norm": 0.021529097110033035, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44284824.0, "rewards/KL_reward/mean": -0.0151272714138031, "rewards/KL_reward/std": 0.012720032595098019, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.036351703107357025, "rewards/angle_reward/std": 0.7288860082626343, "rewards/thinking_verbosity_reward/mean": -1.1615468263626099, "rewards/thinking_verbosity_reward/std": 0.260768860578537, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 528.59375, "epoch": 0.41496598639455784, "grad_norm": 0.020824376493692398, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44383988.0, "rewards/KL_reward/mean": -0.013930471614003181, "rewards/KL_reward/std": 0.01095433160662651, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.46208351850509644, "rewards/angle_reward/mean": -0.005772540345788002, "rewards/angle_reward/std": 0.6870887875556946, "rewards/thinking_verbosity_reward/mean": -1.1287271976470947, "rewards/thinking_verbosity_reward/std": 0.2128688246011734, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 526.5546875, "epoch": 0.41609977324263037, "grad_norm": 0.02206384763121605, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 44483259.0, "rewards/KL_reward/mean": -0.016933314502239227, "rewards/KL_reward/std": 0.012821149080991745, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.03346161171793938, "rewards/angle_reward/std": 0.711763322353363, "rewards/thinking_verbosity_reward/mean": -1.1266493797302246, "rewards/thinking_verbosity_reward/std": 0.21184676885604858, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 602.3046875, "epoch": 0.41723356009070295, "grad_norm": 0.018928341567516327, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44592506.0, "rewards/KL_reward/mean": -0.01630426198244095, "rewards/KL_reward/std": 0.010932328179478645, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/angle_reward/mean": -0.0600726418197155, "rewards/angle_reward/std": 0.7038162350654602, "rewards/thinking_verbosity_reward/mean": -1.2096517086029053, "rewards/thinking_verbosity_reward/std": 0.2007477581501007, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 612.390625, "epoch": 0.41836734693877553, "grad_norm": 0.020504910498857498, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44702980.0, "rewards/KL_reward/mean": -0.011769358068704605, "rewards/KL_reward/std": 0.00859046634286642, "rewards/accuracy_reward/mean": 0.2421875, "rewards/accuracy_reward/std": 0.4300905168056488, "rewards/angle_reward/mean": 0.09859583526849747, "rewards/angle_reward/std": 0.7153234481811523, "rewards/thinking_verbosity_reward/mean": -1.204113245010376, "rewards/thinking_verbosity_reward/std": 0.28140151500701904, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 618.8828125, "epoch": 0.41950113378684806, "grad_norm": 0.019282154738903046, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44814261.0, "rewards/KL_reward/mean": -0.01377407368272543, "rewards/KL_reward/std": 0.011624906212091446, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.00703779049217701, "rewards/angle_reward/std": 0.6897515058517456, "rewards/thinking_verbosity_reward/mean": -1.2140960693359375, "rewards/thinking_verbosity_reward/std": 0.26689741015434265, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 604.5859375, "epoch": 0.42063492063492064, "grad_norm": 0.018426265567541122, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 44923256.0, "rewards/KL_reward/mean": -0.011635626666247845, "rewards/KL_reward/std": 0.007985219359397888, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.03205763176083565, "rewards/angle_reward/std": 0.7076945900917053, "rewards/thinking_verbosity_reward/mean": -1.206263542175293, "rewards/thinking_verbosity_reward/std": 0.23306098580360413, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 541.3125, "epoch": 0.4217687074829932, "grad_norm": 0.021884731948375702, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45024336.0, "rewards/KL_reward/mean": -0.016247743740677834, "rewards/KL_reward/std": 0.011305822059512138, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.05057627707719803, "rewards/angle_reward/std": 0.6928049325942993, "rewards/thinking_verbosity_reward/mean": -1.1414589881896973, "rewards/thinking_verbosity_reward/std": 0.21956755220890045, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 650.328125, "epoch": 0.42290249433106575, "grad_norm": 0.016121037304401398, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 45139514.0, "rewards/KL_reward/mean": -0.013573609292507172, "rewards/KL_reward/std": 0.01031778659671545, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.029925987124443054, "rewards/angle_reward/std": 0.7024592161178589, "rewards/thinking_verbosity_reward/mean": -1.2470903396606445, "rewards/thinking_verbosity_reward/std": 0.26195836067199707, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 574.046875, "epoch": 0.42403628117913833, "grad_norm": 0.026229392737150192, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 45245304.0, "rewards/KL_reward/mean": -0.01583711802959442, "rewards/KL_reward/std": 0.011048772372305393, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.08974478393793106, "rewards/angle_reward/std": 0.7353434562683105, "rewards/thinking_verbosity_reward/mean": -1.1740953922271729, "rewards/thinking_verbosity_reward/std": 0.2335452437400818, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 621.7421875, "epoch": 0.42517006802721086, "grad_norm": 0.02271593175828457, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45356975.0, "rewards/KL_reward/mean": -0.017855174839496613, "rewards/KL_reward/std": 0.013386182487010956, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.06267501413822174, "rewards/angle_reward/std": 0.7156962752342224, "rewards/thinking_verbosity_reward/mean": -1.206615686416626, "rewards/thinking_verbosity_reward/std": 0.3108856976032257, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 539.0625, "epoch": 0.42630385487528344, "grad_norm": 0.022404877468943596, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45458191.0, "rewards/KL_reward/mean": -0.017527278512716293, "rewards/KL_reward/std": 0.01198546402156353, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.0038344012573361397, "rewards/angle_reward/std": 0.7199822664260864, "rewards/thinking_verbosity_reward/mean": -1.1370865106582642, "rewards/thinking_verbosity_reward/std": 0.2293068766593933, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 506.328125, "epoch": 0.427437641723356, "grad_norm": 0.021387537941336632, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45555321.0, "rewards/KL_reward/mean": -0.019968044012784958, "rewards/KL_reward/std": 0.013661942444741726, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.01847529038786888, "rewards/angle_reward/std": 0.7076072096824646, "rewards/thinking_verbosity_reward/mean": -1.1016631126403809, "rewards/thinking_verbosity_reward/std": 0.22371789813041687, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 556.765625, "epoch": 0.42857142857142855, "grad_norm": 0.021009325981140137, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 45657675.0, "rewards/KL_reward/mean": -0.016950562596321106, "rewards/KL_reward/std": 0.013891222886741161, "rewards/accuracy_reward/mean": 0.2734375, "rewards/accuracy_reward/std": 0.447474867105484, "rewards/angle_reward/mean": 0.06334151327610016, "rewards/angle_reward/std": 0.6955203413963318, "rewards/thinking_verbosity_reward/mean": -1.1447854042053223, "rewards/thinking_verbosity_reward/std": 0.28192561864852905, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 504.0703125, "epoch": 0.42970521541950113, "grad_norm": 0.025582019239664078, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45753852.0, "rewards/KL_reward/mean": -0.018711145967245102, "rewards/KL_reward/std": 0.013790667988359928, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.112864650785923, "rewards/angle_reward/std": 0.7187625765800476, "rewards/thinking_verbosity_reward/mean": -1.0957250595092773, "rewards/thinking_verbosity_reward/std": 0.23977471888065338, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 583.546875, "epoch": 0.4308390022675737, "grad_norm": 0.019505100324749947, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45860730.0, "rewards/KL_reward/mean": -0.0188748836517334, "rewards/KL_reward/std": 0.013765603303909302, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.05972357839345932, "rewards/angle_reward/std": 0.6861174702644348, "rewards/thinking_verbosity_reward/mean": -1.185563564300537, "rewards/thinking_verbosity_reward/std": 0.22624468803405762, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 699.2734375, "epoch": 0.43197278911564624, "grad_norm": 0.019939446821808815, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 45982389.0, "rewards/KL_reward/mean": -0.014938908629119396, "rewards/KL_reward/std": 0.011243962682783604, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.10991691052913666, "rewards/angle_reward/std": 0.7018421292304993, "rewards/thinking_verbosity_reward/mean": -1.2750121355056763, "rewards/thinking_verbosity_reward/std": 0.34786784648895264, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 579.0234375, "epoch": 0.4331065759637188, "grad_norm": 0.022386744618415833, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46088696.0, "rewards/KL_reward/mean": -0.016111835837364197, "rewards/KL_reward/std": 0.012026161886751652, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": 0.04101593792438507, "rewards/angle_reward/std": 0.7212110161781311, "rewards/thinking_verbosity_reward/mean": -1.1688454151153564, "rewards/thinking_verbosity_reward/std": 0.2818171977996826, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 463.7734375, "epoch": 0.4342403628117914, "grad_norm": 0.02676510065793991, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46180715.0, "rewards/KL_reward/mean": -0.02379307523369789, "rewards/KL_reward/std": 0.014045041054487228, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/angle_reward/mean": -0.05956241860985756, "rewards/angle_reward/std": 0.6922198534011841, "rewards/thinking_verbosity_reward/mean": -1.0482418537139893, "rewards/thinking_verbosity_reward/std": 0.2420331984758377, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 539.8125, "epoch": 0.43537414965986393, "grad_norm": 0.020799271762371063, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46281923.0, "rewards/KL_reward/mean": -0.01960766687989235, "rewards/KL_reward/std": 0.01396810170263052, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.034598276019096375, "rewards/angle_reward/std": 0.6756842732429504, "rewards/thinking_verbosity_reward/mean": -1.1341664791107178, "rewards/thinking_verbosity_reward/std": 0.24729718267917633, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 635.9609375, "epoch": 0.4365079365079365, "grad_norm": 0.019486747682094574, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46394798.0, "rewards/KL_reward/mean": -0.015526726841926575, "rewards/KL_reward/std": 0.01156558282673359, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.010235173627734184, "rewards/angle_reward/std": 0.7484339475631714, "rewards/thinking_verbosity_reward/mean": -1.2333378791809082, "rewards/thinking_verbosity_reward/std": 0.2584221661090851, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 525.90625, "epoch": 0.4376417233560091, "grad_norm": 0.023115647956728935, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 46493194.0, "rewards/KL_reward/mean": -0.02045394666492939, "rewards/KL_reward/std": 0.015044222585856915, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.005462062545120716, "rewards/angle_reward/std": 0.709326446056366, "rewards/thinking_verbosity_reward/mean": -1.1223537921905518, "rewards/thinking_verbosity_reward/std": 0.23022116720676422, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 555.9375, "epoch": 0.4387755102040816, "grad_norm": 0.02062627114355564, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46596610.0, "rewards/KL_reward/mean": -0.02183781936764717, "rewards/KL_reward/std": 0.01346514094620943, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/angle_reward/mean": 0.08628194779157639, "rewards/angle_reward/std": 0.6790797114372253, "rewards/thinking_verbosity_reward/mean": -1.1447288990020752, "rewards/thinking_verbosity_reward/std": 0.27846965193748474, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 644.078125, "epoch": 0.4399092970521542, "grad_norm": 0.01849028281867504, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 46711204.0, "rewards/KL_reward/mean": -0.014653094112873077, "rewards/KL_reward/std": 0.010434862226247787, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.004651002585887909, "rewards/angle_reward/std": 0.6873242855072021, "rewards/thinking_verbosity_reward/mean": -1.2303717136383057, "rewards/thinking_verbosity_reward/std": 0.30763575434684753, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 503.7421875, "epoch": 0.4410430839002268, "grad_norm": 0.025818170979619026, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46807651.0, "rewards/KL_reward/mean": -0.020328128710389137, "rewards/KL_reward/std": 0.013701298274099827, "rewards/accuracy_reward/mean": 0.2109375, "rewards/accuracy_reward/std": 0.4095771610736847, "rewards/angle_reward/mean": -0.047695379704236984, "rewards/angle_reward/std": 0.695172905921936, "rewards/thinking_verbosity_reward/mean": -1.0927703380584717, "rewards/thinking_verbosity_reward/std": 0.25136125087738037, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 565.3203125, "epoch": 0.4421768707482993, "grad_norm": 0.022490637376904488, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 46911300.0, "rewards/KL_reward/mean": -0.026106202974915504, "rewards/KL_reward/std": 0.026732752099633217, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.03920355811715126, "rewards/angle_reward/std": 0.6827701926231384, "rewards/thinking_verbosity_reward/mean": -1.1563987731933594, "rewards/thinking_verbosity_reward/std": 0.2722534239292145, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 564.9296875, "epoch": 0.4433106575963719, "grad_norm": 0.021665828302502632, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 47015707.0, "rewards/KL_reward/mean": -0.019879762083292007, "rewards/KL_reward/std": 0.014529038220643997, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.03593013435602188, "rewards/angle_reward/std": 0.7235757112503052, "rewards/thinking_verbosity_reward/mean": -1.1605231761932373, "rewards/thinking_verbosity_reward/std": 0.2520044147968292, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 421.9375, "epoch": 0.4444444444444444, "grad_norm": 0.025612734258174896, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 47101067.0, "rewards/KL_reward/mean": -0.027379008010029793, "rewards/KL_reward/std": 0.014442806132137775, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.08324871957302094, "rewards/angle_reward/std": 0.6658397316932678, "rewards/thinking_verbosity_reward/mean": -1.0087127685546875, "rewards/thinking_verbosity_reward/std": 0.18739484250545502, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 507.890625, "epoch": 0.445578231292517, "grad_norm": 0.024070149287581444, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 47198117.0, "rewards/KL_reward/mean": -0.020547538995742798, "rewards/KL_reward/std": 0.014126886613667011, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.005580000579357147, "rewards/angle_reward/std": 0.7161521911621094, "rewards/thinking_verbosity_reward/mean": -1.1085846424102783, "rewards/thinking_verbosity_reward/std": 0.19633635878562927, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 480.65625, "epoch": 0.4467120181405896, "grad_norm": 0.021039605140686035, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 47291625.0, "rewards/KL_reward/mean": -0.021802667528390884, "rewards/KL_reward/std": 0.01594499498605728, "rewards/accuracy_reward/mean": 0.265625, "rewards/accuracy_reward/std": 0.44340085983276367, "rewards/angle_reward/mean": 0.029494380578398705, "rewards/angle_reward/std": 0.7037698030471802, "rewards/thinking_verbosity_reward/mean": -1.073141098022461, "rewards/thinking_verbosity_reward/std": 0.2188214212656021, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 550.5, "epoch": 0.4478458049886621, "grad_norm": 0.02214093878865242, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 47393793.0, "rewards/KL_reward/mean": -0.021151121705770493, "rewards/KL_reward/std": 0.014054550789296627, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/angle_reward/mean": 0.009490861557424068, "rewards/angle_reward/std": 0.6957032680511475, "rewards/thinking_verbosity_reward/mean": -1.1497143507003784, "rewards/thinking_verbosity_reward/std": 0.22868293523788452, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 596.8984375, "epoch": 0.4489795918367347, "grad_norm": 0.02260676957666874, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 47501172.0, "rewards/KL_reward/mean": -0.021430671215057373, "rewards/KL_reward/std": 0.01571296900510788, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40390563011169434, "rewards/angle_reward/mean": -0.0022510290145874023, "rewards/angle_reward/std": 0.6822193264961243, "rewards/thinking_verbosity_reward/mean": -1.1782336235046387, "rewards/thinking_verbosity_reward/std": 0.3198300004005432, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 566.7890625, "epoch": 0.4501133786848073, "grad_norm": 0.020635398104786873, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 47605857.0, "rewards/KL_reward/mean": -0.02311992645263672, "rewards/KL_reward/std": 0.01627832092344761, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": 0.0059994058683514595, "rewards/angle_reward/std": 0.7020658850669861, "rewards/thinking_verbosity_reward/mean": -1.1678216457366943, "rewards/thinking_verbosity_reward/std": 0.22597408294677734, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 532.828125, "epoch": 0.4512471655328798, "grad_norm": 0.02402258850634098, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 47706115.0, "rewards/KL_reward/mean": -0.02359929494559765, "rewards/KL_reward/std": 0.01530496682971716, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.03661785274744034, "rewards/angle_reward/std": 0.7108094096183777, "rewards/thinking_verbosity_reward/mean": -1.1365866661071777, "rewards/thinking_verbosity_reward/std": 0.19503378868103027, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 635.484375, "epoch": 0.4523809523809524, "grad_norm": 0.02138921432197094, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 47818993.0, "rewards/KL_reward/mean": -0.02280566468834877, "rewards/KL_reward/std": 0.015408539213240147, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.01748541370034218, "rewards/angle_reward/std": 0.6938936710357666, "rewards/thinking_verbosity_reward/mean": -1.2105220556259155, "rewards/thinking_verbosity_reward/std": 0.34896862506866455, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 522.609375, "epoch": 0.45351473922902497, "grad_norm": 0.023752223700284958, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 47917839.0, "rewards/KL_reward/mean": -0.0232955701649189, "rewards/KL_reward/std": 0.01624547690153122, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.0803392305970192, "rewards/angle_reward/std": 0.7035095691680908, "rewards/thinking_verbosity_reward/mean": -1.1072452068328857, "rewards/thinking_verbosity_reward/std": 0.28043869137763977, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 535.421875, "epoch": 0.4546485260770975, "grad_norm": 0.022278867661952972, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 48018445.0, "rewards/KL_reward/mean": -0.023161131888628006, "rewards/KL_reward/std": 0.01387379877269268, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": 0.0032333284616470337, "rewards/angle_reward/std": 0.7280095219612122, "rewards/thinking_verbosity_reward/mean": -1.1364167928695679, "rewards/thinking_verbosity_reward/std": 0.21204441785812378, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 492.0546875, "epoch": 0.4557823129251701, "grad_norm": 0.022933771833777428, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48113244.0, "rewards/KL_reward/mean": -0.02552301064133644, "rewards/KL_reward/std": 0.01558267418295145, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.08489933609962463, "rewards/angle_reward/std": 0.7113597393035889, "rewards/thinking_verbosity_reward/mean": -1.092618703842163, "rewards/thinking_verbosity_reward/std": 0.1845749169588089, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 473.4375, "epoch": 0.45691609977324266, "grad_norm": 0.02334679663181305, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48205676.0, "rewards/KL_reward/mean": -0.02721046656370163, "rewards/KL_reward/std": 0.01823466457426548, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.02178448811173439, "rewards/angle_reward/std": 0.6964385509490967, "rewards/thinking_verbosity_reward/mean": -1.0612900257110596, "rewards/thinking_verbosity_reward/std": 0.2350475788116455, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 470.71875, "epoch": 0.4580498866213152, "grad_norm": 0.024207444861531258, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48298328.0, "rewards/KL_reward/mean": -0.026227515190839767, "rewards/KL_reward/std": 0.01750863343477249, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": 0.08088646084070206, "rewards/angle_reward/std": 0.6970916390419006, "rewards/thinking_verbosity_reward/mean": -1.0650391578674316, "rewards/thinking_verbosity_reward/std": 0.20070774853229523, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 497.03125, "epoch": 0.45918367346938777, "grad_norm": 0.026575271040201187, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48394196.0, "rewards/KL_reward/mean": -0.02246716618537903, "rewards/KL_reward/std": 0.01457224227488041, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.10408850014209747, "rewards/angle_reward/std": 0.7194622755050659, "rewards/thinking_verbosity_reward/mean": -1.0898295640945435, "rewards/thinking_verbosity_reward/std": 0.2296569049358368, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 560.3984375, "epoch": 0.4603174603174603, "grad_norm": 0.02821190282702446, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 48498383.0, "rewards/KL_reward/mean": -0.02293389104306698, "rewards/KL_reward/std": 0.016351953148841858, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.11334050446748734, "rewards/angle_reward/std": 0.7183730602264404, "rewards/thinking_verbosity_reward/mean": -1.1596062183380127, "rewards/thinking_verbosity_reward/std": 0.2328803390264511, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 542.1796875, "epoch": 0.4614512471655329, "grad_norm": 0.022647786885499954, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48599854.0, "rewards/KL_reward/mean": -0.023735279217362404, "rewards/KL_reward/std": 0.015727022662758827, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": 0.006649286486208439, "rewards/angle_reward/std": 0.7050229907035828, "rewards/thinking_verbosity_reward/mean": -1.139451026916504, "rewards/thinking_verbosity_reward/std": 0.23458538949489594, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 471.328125, "epoch": 0.46258503401360546, "grad_norm": 0.02578257955610752, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 48692136.0, "rewards/KL_reward/mean": -0.025398539379239082, "rewards/KL_reward/std": 0.016654588282108307, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.019923239946365356, "rewards/angle_reward/std": 0.7124901413917542, "rewards/thinking_verbosity_reward/mean": -1.0590758323669434, "rewards/thinking_verbosity_reward/std": 0.23363560438156128, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 604.5546875, "epoch": 0.463718820861678, "grad_norm": 0.022921523079276085, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48801967.0, "rewards/KL_reward/mean": -0.023579616099596024, "rewards/KL_reward/std": 0.0175795741379261, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.05431587994098663, "rewards/angle_reward/std": 0.6877104043960571, "rewards/thinking_verbosity_reward/mean": -1.1805675029754639, "rewards/thinking_verbosity_reward/std": 0.340753436088562, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 494.09375, "epoch": 0.46485260770975056, "grad_norm": 0.024589331820607185, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 48896779.0, "rewards/KL_reward/mean": -0.027412384748458862, "rewards/KL_reward/std": 0.016354065388441086, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.05277753621339798, "rewards/angle_reward/std": 0.6910402774810791, "rewards/thinking_verbosity_reward/mean": -1.088793158531189, "rewards/thinking_verbosity_reward/std": 0.21807634830474854, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 526.328125, "epoch": 0.46598639455782315, "grad_norm": 0.021234115585684776, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 48995669.0, "rewards/KL_reward/mean": -0.03023386560380459, "rewards/KL_reward/std": 0.021569764241576195, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.04875587671995163, "rewards/angle_reward/std": 0.6895540356636047, "rewards/thinking_verbosity_reward/mean": -1.1237661838531494, "rewards/thinking_verbosity_reward/std": 0.2254941314458847, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 536.046875, "epoch": 0.4671201814058957, "grad_norm": 0.0258078221231699, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 49096083.0, "rewards/KL_reward/mean": -0.02499195747077465, "rewards/KL_reward/std": 0.01609044522047043, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.021279610693454742, "rewards/angle_reward/std": 0.7131556272506714, "rewards/thinking_verbosity_reward/mean": -1.123023271560669, "rewards/thinking_verbosity_reward/std": 0.27755653858184814, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 476.859375, "epoch": 0.46825396825396826, "grad_norm": 0.03131388872861862, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 49189033.0, "rewards/KL_reward/mean": -0.028134461492300034, "rewards/KL_reward/std": 0.018374817445874214, "rewards/accuracy_reward/mean": 0.2890625, "rewards/accuracy_reward/std": 0.45510825514793396, "rewards/angle_reward/mean": -0.025143563747406006, "rewards/angle_reward/std": 0.7260422706604004, "rewards/thinking_verbosity_reward/mean": -1.0641911029815674, "rewards/thinking_verbosity_reward/std": 0.23999130725860596, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 473.953125, "epoch": 0.46938775510204084, "grad_norm": 0.023590773344039917, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 49281419.0, "rewards/KL_reward/mean": -0.02742850035429001, "rewards/KL_reward/std": 0.02026979625225067, "rewards/accuracy_reward/mean": 0.3203125, "rewards/accuracy_reward/std": 0.4684300124645233, "rewards/angle_reward/mean": 0.03548327833414078, "rewards/angle_reward/std": 0.7357466816902161, "rewards/thinking_verbosity_reward/mean": -1.0584043264389038, "rewards/thinking_verbosity_reward/std": 0.2503049075603485, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 512.1484375, "epoch": 0.47052154195011336, "grad_norm": 0.020758090540766716, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 49378894.0, "rewards/KL_reward/mean": -0.027455642819404602, "rewards/KL_reward/std": 0.018511703237891197, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.028938554227352142, "rewards/angle_reward/std": 0.7072205543518066, "rewards/thinking_verbosity_reward/mean": -1.1127849817276, "rewards/thinking_verbosity_reward/std": 0.19933627545833588, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 490.8671875, "epoch": 0.47165532879818595, "grad_norm": 0.023538529872894287, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 49473717.0, "rewards/KL_reward/mean": -0.03473803400993347, "rewards/KL_reward/std": 0.02424466982483864, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": -0.07885561883449554, "rewards/angle_reward/std": 0.6736251711845398, "rewards/thinking_verbosity_reward/mean": -1.0767022371292114, "rewards/thinking_verbosity_reward/std": 0.2567003071308136, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 562.8984375, "epoch": 0.47278911564625853, "grad_norm": 0.02368626557290554, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 49577232.0, "rewards/KL_reward/mean": -0.028153687715530396, "rewards/KL_reward/std": 0.017911860719323158, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3787541687488556, "rewards/angle_reward/mean": -0.01273367926478386, "rewards/angle_reward/std": 0.705632209777832, "rewards/thinking_verbosity_reward/mean": -1.150090217590332, "rewards/thinking_verbosity_reward/std": 0.28758352994918823, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 466.703125, "epoch": 0.47392290249433106, "grad_norm": 0.02429896593093872, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 49668962.0, "rewards/KL_reward/mean": -0.031044352799654007, "rewards/KL_reward/std": 0.01687506027519703, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.05802838131785393, "rewards/angle_reward/std": 0.6904194951057434, "rewards/thinking_verbosity_reward/mean": -1.0635954141616821, "rewards/thinking_verbosity_reward/std": 0.18243525922298431, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 456.890625, "epoch": 0.47505668934240364, "grad_norm": 0.028251731768250465, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 49758980.0, "rewards/KL_reward/mean": -0.03525323420763016, "rewards/KL_reward/std": 0.02170429937541485, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.46208351850509644, "rewards/angle_reward/mean": 0.05117577686905861, "rewards/angle_reward/std": 0.7218906879425049, "rewards/thinking_verbosity_reward/mean": -1.0450398921966553, "rewards/thinking_verbosity_reward/std": 0.2190735787153244, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 400.375, "epoch": 0.47619047619047616, "grad_norm": 0.02744399756193161, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 49842364.0, "rewards/KL_reward/mean": -0.039281006902456284, "rewards/KL_reward/std": 0.025512907654047012, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.04501733183860779, "rewards/angle_reward/std": 0.6914629936218262, "rewards/thinking_verbosity_reward/mean": -0.9827430248260498, "rewards/thinking_verbosity_reward/std": 0.1813047230243683, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 457.8671875, "epoch": 0.47732426303854875, "grad_norm": 0.024862121790647507, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 49932675.0, "rewards/KL_reward/mean": -0.033260006457567215, "rewards/KL_reward/std": 0.017655346542596817, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": 0.023770904168486595, "rewards/angle_reward/std": 0.7306255102157593, "rewards/thinking_verbosity_reward/mean": -1.0440349578857422, "rewards/thinking_verbosity_reward/std": 0.22932352125644684, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 471.09375, "epoch": 0.47845804988662133, "grad_norm": 0.029904166236519814, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 50024623.0, "rewards/KL_reward/mean": -0.03850427269935608, "rewards/KL_reward/std": 0.030644718557596207, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.013057619333267212, "rewards/angle_reward/std": 0.6803799271583557, "rewards/thinking_verbosity_reward/mean": -1.0541119575500488, "rewards/thinking_verbosity_reward/std": 0.25413978099823, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 451.3984375, "epoch": 0.47959183673469385, "grad_norm": 0.0292107742279768, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50114562.0, "rewards/KL_reward/mean": -0.03692799434065819, "rewards/KL_reward/std": 0.021713055670261383, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -9.727664291858673e-05, "rewards/angle_reward/std": 0.7122227549552917, "rewards/thinking_verbosity_reward/mean": -1.0429333448410034, "rewards/thinking_verbosity_reward/std": 0.1964370459318161, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 513.953125, "epoch": 0.48072562358276644, "grad_norm": 0.02299739606678486, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50212252.0, "rewards/KL_reward/mean": -0.0356745719909668, "rewards/KL_reward/std": 0.024989625439047813, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.040337808430194855, "rewards/angle_reward/std": 0.7102200388908386, "rewards/thinking_verbosity_reward/mean": -1.1041864156723022, "rewards/thinking_verbosity_reward/std": 0.25233370065689087, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 404.171875, "epoch": 0.481859410430839, "grad_norm": 0.024677474051713943, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 50295098.0, "rewards/KL_reward/mean": -0.039033617824316025, "rewards/KL_reward/std": 0.025141440331935883, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.04172428324818611, "rewards/angle_reward/std": 0.7007785439491272, "rewards/thinking_verbosity_reward/mean": -0.9804962277412415, "rewards/thinking_verbosity_reward/std": 0.21661804616451263, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 393.8046875, "epoch": 0.48299319727891155, "grad_norm": 0.02994668297469616, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 50377305.0, "rewards/KL_reward/mean": -0.04187620431184769, "rewards/KL_reward/std": 0.027131345123052597, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.06471440196037292, "rewards/angle_reward/std": 0.6868351101875305, "rewards/thinking_verbosity_reward/mean": -0.96719890832901, "rewards/thinking_verbosity_reward/std": 0.21648328006267548, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 444.3515625, "epoch": 0.48412698412698413, "grad_norm": 0.025785937905311584, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50466102.0, "rewards/KL_reward/mean": -0.0387788824737072, "rewards/KL_reward/std": 0.018455451354384422, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": 0.012625513598322868, "rewards/angle_reward/std": 0.693104088306427, "rewards/thinking_verbosity_reward/mean": -1.03594970703125, "rewards/thinking_verbosity_reward/std": 0.18826691806316376, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 509.0234375, "epoch": 0.4852607709750567, "grad_norm": 0.02384641021490097, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50563505.0, "rewards/KL_reward/mean": -0.03446277976036072, "rewards/KL_reward/std": 0.02286764234304428, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.04777942970395088, "rewards/angle_reward/std": 0.6951702237129211, "rewards/thinking_verbosity_reward/mean": -1.0935171842575073, "rewards/thinking_verbosity_reward/std": 0.2736482620239258, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 389.0859375, "epoch": 0.48639455782312924, "grad_norm": 0.032591525465250015, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50645268.0, "rewards/KL_reward/mean": -0.04487357661128044, "rewards/KL_reward/std": 0.025163918733596802, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": 0.011689020320773125, "rewards/angle_reward/std": 0.6959567070007324, "rewards/thinking_verbosity_reward/mean": -0.9662455916404724, "rewards/thinking_verbosity_reward/std": 0.192021906375885, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 416.7265625, "epoch": 0.4875283446712018, "grad_norm": 0.030018575489521027, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 50730665.0, "rewards/KL_reward/mean": -0.046098943799734116, "rewards/KL_reward/std": 0.026156146079301834, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.05590461194515228, "rewards/angle_reward/std": 0.6785492300987244, "rewards/thinking_verbosity_reward/mean": -0.9995837211608887, "rewards/thinking_verbosity_reward/std": 0.2010611891746521, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 479.203125, "epoch": 0.4886621315192744, "grad_norm": 0.022822268307209015, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50824171.0, "rewards/KL_reward/mean": -0.0350266769528389, "rewards/KL_reward/std": 0.024317413568496704, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.02126418985426426, "rewards/angle_reward/std": 0.7227473258972168, "rewards/thinking_verbosity_reward/mean": -1.0733683109283447, "rewards/thinking_verbosity_reward/std": 0.2090698778629303, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 350.5859375, "epoch": 0.4897959183673469, "grad_norm": 0.03339194133877754, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 50901174.0, "rewards/KL_reward/mean": -0.04047293961048126, "rewards/KL_reward/std": 0.027730628848075867, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": -0.03062419593334198, "rewards/angle_reward/std": 0.686697244644165, "rewards/thinking_verbosity_reward/mean": -0.9178509712219238, "rewards/thinking_verbosity_reward/std": 0.1779995560646057, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 422.5546875, "epoch": 0.4909297052154195, "grad_norm": 0.028161540627479553, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 50987429.0, "rewards/KL_reward/mean": -0.04439615458250046, "rewards/KL_reward/std": 0.025982331484556198, "rewards/accuracy_reward/mean": 0.1015625, "rewards/accuracy_reward/std": 0.3032590448856354, "rewards/angle_reward/mean": -0.004183335229754448, "rewards/angle_reward/std": 0.7039921879768372, "rewards/thinking_verbosity_reward/mean": -1.012196660041809, "rewards/thinking_verbosity_reward/std": 0.17203310132026672, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 409.03125, "epoch": 0.49206349206349204, "grad_norm": 0.025642655789852142, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 51071617.0, "rewards/KL_reward/mean": -0.0451393648982048, "rewards/KL_reward/std": 0.026113361120224, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.014498209580779076, "rewards/angle_reward/std": 0.6932234764099121, "rewards/thinking_verbosity_reward/mean": -0.9939348697662354, "rewards/thinking_verbosity_reward/std": 0.18006928265094757, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 411.46875, "epoch": 0.4931972789115646, "grad_norm": 0.028984270989894867, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51156709.0, "rewards/KL_reward/mean": -0.04902590066194534, "rewards/KL_reward/std": 0.02902785688638687, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.001699242740869522, "rewards/angle_reward/std": 0.7301017045974731, "rewards/thinking_verbosity_reward/mean": -0.990402102470398, "rewards/thinking_verbosity_reward/std": 0.2136627733707428, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 421.859375, "epoch": 0.4943310657596372, "grad_norm": 0.02719872258603573, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51242339.0, "rewards/KL_reward/mean": -0.04764976352453232, "rewards/KL_reward/std": 0.02956109680235386, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": -0.04444997012615204, "rewards/angle_reward/std": 0.682907223701477, "rewards/thinking_verbosity_reward/mean": -1.0076520442962646, "rewards/thinking_verbosity_reward/std": 0.19254931807518005, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 403.5546875, "epoch": 0.4954648526077097, "grad_norm": 0.02942391112446785, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51326618.0, "rewards/KL_reward/mean": -0.052202943712472916, "rewards/KL_reward/std": 0.03445935621857643, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.0014247801154851913, "rewards/angle_reward/std": 0.695796012878418, "rewards/thinking_verbosity_reward/mean": -0.985343337059021, "rewards/thinking_verbosity_reward/std": 0.18911543488502502, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 356.40625, "epoch": 0.4965986394557823, "grad_norm": 0.03621532395482063, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 51404134.0, "rewards/KL_reward/mean": -0.05607658624649048, "rewards/KL_reward/std": 0.0350998118519783, "rewards/accuracy_reward/mean": 0.1796875, "rewards/accuracy_reward/std": 0.3854354918003082, "rewards/angle_reward/mean": -0.01746700517833233, "rewards/angle_reward/std": 0.7109165191650391, "rewards/thinking_verbosity_reward/mean": -0.9277302622795105, "rewards/thinking_verbosity_reward/std": 0.16754429042339325, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 392.90625, "epoch": 0.4977324263038549, "grad_norm": 0.03103005886077881, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 51486514.0, "rewards/KL_reward/mean": -0.057342153042554855, "rewards/KL_reward/std": 0.03963745757937431, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.06879903376102448, "rewards/angle_reward/std": 0.6926581263542175, "rewards/thinking_verbosity_reward/mean": -0.9679933190345764, "rewards/thinking_verbosity_reward/std": 0.207347109913826, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 392.453125, "epoch": 0.4988662131519274, "grad_norm": 0.03392069414258003, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51568908.0, "rewards/KL_reward/mean": -0.05464167892932892, "rewards/KL_reward/std": 0.024992190301418304, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.047817666083574295, "rewards/angle_reward/std": 0.6898568272590637, "rewards/thinking_verbosity_reward/mean": -0.9665283560752869, "rewards/thinking_verbosity_reward/std": 0.2116737812757492, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 396.09375, "epoch": 0.5, "grad_norm": 0.03598444536328316, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 51650688.0, "rewards/KL_reward/mean": -0.061555199325084686, "rewards/KL_reward/std": 0.03372234106063843, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.006012503057718277, "rewards/angle_reward/std": 0.699260413646698, "rewards/thinking_verbosity_reward/mean": -0.9670593738555908, "rewards/thinking_verbosity_reward/std": 0.22992496192455292, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 382.9296875, "epoch": 0.5011337868480725, "grad_norm": 0.02516038529574871, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51731551.0, "rewards/KL_reward/mean": -0.06171301752328873, "rewards/KL_reward/std": 0.036923423409461975, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.024304382503032684, "rewards/angle_reward/std": 0.6884368062019348, "rewards/thinking_verbosity_reward/mean": -0.95870041847229, "rewards/thinking_verbosity_reward/std": 0.18973399698734283, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 399.3359375, "epoch": 0.5022675736961452, "grad_norm": 0.030778488144278526, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51815138.0, "rewards/KL_reward/mean": -0.0707942545413971, "rewards/KL_reward/std": 0.04487849026918411, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/angle_reward/mean": -0.0013290448114275932, "rewards/angle_reward/std": 0.6961056590080261, "rewards/thinking_verbosity_reward/mean": -0.9707742929458618, "rewards/thinking_verbosity_reward/std": 0.23211538791656494, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 403.15625, "epoch": 0.5034013605442177, "grad_norm": 0.034018371254205704, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 51898326.0, "rewards/KL_reward/mean": -0.064726322889328, "rewards/KL_reward/std": 0.03655809909105301, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": 0.035316262394189835, "rewards/angle_reward/std": 0.6933196783065796, "rewards/thinking_verbosity_reward/mean": -0.9698935747146606, "rewards/thinking_verbosity_reward/std": 0.25535571575164795, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 342.8046875, "epoch": 0.5045351473922902, "grad_norm": 0.03644280880689621, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 51974157.0, "rewards/KL_reward/mean": -0.07590562105178833, "rewards/KL_reward/std": 0.03883231058716774, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.008078407496213913, "rewards/angle_reward/std": 0.7256864905357361, "rewards/thinking_verbosity_reward/mean": -0.9066504240036011, "rewards/thinking_verbosity_reward/std": 0.18092256784439087, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 354.25, "epoch": 0.5056689342403629, "grad_norm": 0.034074749797582626, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52051261.0, "rewards/KL_reward/mean": -0.06324449926614761, "rewards/KL_reward/std": 0.028212472796440125, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.0013993959873914719, "rewards/angle_reward/std": 0.7149010896682739, "rewards/thinking_verbosity_reward/mean": -0.9295545220375061, "rewards/thinking_verbosity_reward/std": 0.13850507140159607, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 381.3984375, "epoch": 0.5068027210884354, "grad_norm": 0.0305367149412632, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 52131848.0, "rewards/KL_reward/mean": -0.06172921508550644, "rewards/KL_reward/std": 0.04027363657951355, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42527204751968384, "rewards/angle_reward/mean": -0.05292728170752525, "rewards/angle_reward/std": 0.6935394406318665, "rewards/thinking_verbosity_reward/mean": -0.9622151851654053, "rewards/thinking_verbosity_reward/std": 0.15917249023914337, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 367.2734375, "epoch": 0.5079365079365079, "grad_norm": 0.03114541992545128, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 52210963.0, "rewards/KL_reward/mean": -0.06039270758628845, "rewards/KL_reward/std": 0.03316747024655342, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.024122249335050583, "rewards/angle_reward/std": 0.7034289836883545, "rewards/thinking_verbosity_reward/mean": -0.9365625977516174, "rewards/thinking_verbosity_reward/std": 0.19702234864234924, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 332.9921875, "epoch": 0.5090702947845805, "grad_norm": 0.030634764581918716, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52285426.0, "rewards/KL_reward/mean": -0.08006307482719421, "rewards/KL_reward/std": 0.049913886934518814, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.02648836001753807, "rewards/angle_reward/std": 0.6740854382514954, "rewards/thinking_verbosity_reward/mean": -0.8969378471374512, "rewards/thinking_verbosity_reward/std": 0.1602609008550644, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 388.625, "epoch": 0.5102040816326531, "grad_norm": 0.03345860168337822, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52366106.0, "rewards/KL_reward/mean": -0.07310838997364044, "rewards/KL_reward/std": 0.037350211292505264, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.10642776638269424, "rewards/angle_reward/std": 0.6874831914901733, "rewards/thinking_verbosity_reward/mean": -0.959571361541748, "rewards/thinking_verbosity_reward/std": 0.22060275077819824, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 386.3125, "epoch": 0.5113378684807256, "grad_norm": 0.03417220711708069, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 52447378.0, "rewards/KL_reward/mean": -0.06972689926624298, "rewards/KL_reward/std": 0.038145698606967926, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.3979988098144531, "rewards/angle_reward/mean": 0.06792110949754715, "rewards/angle_reward/std": 0.7083807587623596, "rewards/thinking_verbosity_reward/mean": -0.9497437477111816, "rewards/thinking_verbosity_reward/std": 0.2483774721622467, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 357.0703125, "epoch": 0.5124716553287982, "grad_norm": 0.03606312349438667, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52525219.0, "rewards/KL_reward/mean": -0.09044323861598969, "rewards/KL_reward/std": 0.057571690529584885, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.434714138507843, "rewards/angle_reward/mean": -0.01749390922486782, "rewards/angle_reward/std": 0.7063094973564148, "rewards/thinking_verbosity_reward/mean": -0.9207316040992737, "rewards/thinking_verbosity_reward/std": 0.20679256319999695, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 316.4140625, "epoch": 0.5136054421768708, "grad_norm": 0.038564532995224, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52598072.0, "rewards/KL_reward/mean": -0.09466668218374252, "rewards/KL_reward/std": 0.04971272870898247, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31333550810813904, "rewards/angle_reward/mean": -0.06022874265909195, "rewards/angle_reward/std": 0.7210524082183838, "rewards/thinking_verbosity_reward/mean": -0.8652991056442261, "rewards/thinking_verbosity_reward/std": 0.20021560788154602, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 372.609375, "epoch": 0.5147392290249433, "grad_norm": 0.035059019923210144, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52677518.0, "rewards/KL_reward/mean": -0.08334328979253769, "rewards/KL_reward/std": 0.0473569817841053, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3645188808441162, "rewards/angle_reward/mean": -0.015776190906763077, "rewards/angle_reward/std": 0.6482148766517639, "rewards/thinking_verbosity_reward/mean": -0.937674880027771, "rewards/thinking_verbosity_reward/std": 0.22401179373264313, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 391.765625, "epoch": 0.5158730158730159, "grad_norm": 0.031195441260933876, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52759248.0, "rewards/KL_reward/mean": -0.07809939235448837, "rewards/KL_reward/std": 0.04455532878637314, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": 0.008754374459385872, "rewards/angle_reward/std": 0.6928679347038269, "rewards/thinking_verbosity_reward/mean": -0.9701083898544312, "rewards/thinking_verbosity_reward/std": 0.18991024792194366, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 310.3671875, "epoch": 0.5170068027210885, "grad_norm": 0.04037066176533699, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 52830615.0, "rewards/KL_reward/mean": -0.10021911561489105, "rewards/KL_reward/std": 0.05514080449938774, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3490002751350403, "rewards/angle_reward/mean": -0.06877849251031876, "rewards/angle_reward/std": 0.6966662406921387, "rewards/thinking_verbosity_reward/mean": -0.8576784133911133, "rewards/thinking_verbosity_reward/std": 0.19515058398246765, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 298.53125, "epoch": 0.518140589569161, "grad_norm": 0.03613923117518425, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 52900531.0, "rewards/KL_reward/mean": -0.09775243699550629, "rewards/KL_reward/std": 0.04872914031147957, "rewards/accuracy_reward/mean": 0.1640625, "rewards/accuracy_reward/std": 0.371787428855896, "rewards/angle_reward/mean": 0.009356520138680935, "rewards/angle_reward/std": 0.7003117799758911, "rewards/thinking_verbosity_reward/mean": -0.8445853590965271, "rewards/thinking_verbosity_reward/std": 0.1752832531929016, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 317.5703125, "epoch": 0.5192743764172335, "grad_norm": 0.036219146102666855, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 52973124.0, "rewards/KL_reward/mean": -0.1005428358912468, "rewards/KL_reward/std": 0.0575571209192276, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.03271003067493439, "rewards/angle_reward/std": 0.6863457560539246, "rewards/thinking_verbosity_reward/mean": -0.8697078227996826, "rewards/thinking_verbosity_reward/std": 0.18790937960147858, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 308.2890625, "epoch": 0.5204081632653061, "grad_norm": 0.034134335815906525, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53044521.0, "rewards/KL_reward/mean": -0.1032245084643364, "rewards/KL_reward/std": 0.057847797870635986, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.043558768928050995, "rewards/angle_reward/std": 0.6570540070533752, "rewards/thinking_verbosity_reward/mean": -0.8615978956222534, "rewards/thinking_verbosity_reward/std": 0.16141784191131592, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 346.296875, "epoch": 0.5215419501133787, "grad_norm": 0.03669775277376175, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53120535.0, "rewards/KL_reward/mean": -0.10197243839502335, "rewards/KL_reward/std": 0.06329569220542908, "rewards/accuracy_reward/mean": 0.1484375, "rewards/accuracy_reward/std": 0.356930136680603, "rewards/angle_reward/mean": -0.01836412027478218, "rewards/angle_reward/std": 0.7148515582084656, "rewards/thinking_verbosity_reward/mean": -0.8993373513221741, "rewards/thinking_verbosity_reward/std": 0.23422886431217194, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 255.59375, "epoch": 0.5226757369614512, "grad_norm": 0.04212069511413574, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53184771.0, "rewards/KL_reward/mean": -0.13540199398994446, "rewards/KL_reward/std": 0.06138134002685547, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.1746762990951538, "rewards/angle_reward/mean": -0.05444896221160889, "rewards/angle_reward/std": 0.7013072967529297, "rewards/thinking_verbosity_reward/mean": -0.7881078124046326, "rewards/thinking_verbosity_reward/std": 0.12446502596139908, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 282.8515625, "epoch": 0.5238095238095238, "grad_norm": 0.04676346480846405, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 53253496.0, "rewards/KL_reward/mean": -0.12668287754058838, "rewards/KL_reward/std": 0.0727287158370018, "rewards/accuracy_reward/mean": 0.09375, "rewards/accuracy_reward/std": 0.29262590408325195, "rewards/angle_reward/mean": -0.09203147888183594, "rewards/angle_reward/std": 0.6941730380058289, "rewards/thinking_verbosity_reward/mean": -0.8155484199523926, "rewards/thinking_verbosity_reward/std": 0.19955144822597504, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 267.4765625, "epoch": 0.5249433106575964, "grad_norm": 0.05062921345233917, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53319877.0, "rewards/KL_reward/mean": -0.1394931823015213, "rewards/KL_reward/std": 0.05866050720214844, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15188287198543549, "rewards/angle_reward/mean": -0.004266105592250824, "rewards/angle_reward/std": 0.7063484191894531, "rewards/thinking_verbosity_reward/mean": -0.8024481534957886, "rewards/thinking_verbosity_reward/std": 0.14974668622016907, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 270.5390625, "epoch": 0.5260770975056689, "grad_norm": 0.03620358929038048, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53386546.0, "rewards/KL_reward/mean": -0.1303442418575287, "rewards/KL_reward/std": 0.06146172061562538, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.3320184051990509, "rewards/angle_reward/mean": -0.04130447655916214, "rewards/angle_reward/std": 0.6710583567619324, "rewards/thinking_verbosity_reward/mean": -0.8068867921829224, "rewards/thinking_verbosity_reward/std": 0.15152832865715027, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 278.0625, "epoch": 0.5272108843537415, "grad_norm": 0.04413303732872009, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53454690.0, "rewards/KL_reward/mean": -0.15620750188827515, "rewards/KL_reward/std": 0.07933449745178223, "rewards/accuracy_reward/mean": 0.1875, "rewards/accuracy_reward/std": 0.39184603095054626, "rewards/angle_reward/mean": 0.01784469187259674, "rewards/angle_reward/std": 0.6926581859588623, "rewards/thinking_verbosity_reward/mean": -0.8130091428756714, "rewards/thinking_verbosity_reward/std": 0.17866672575473785, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 231.09375, "epoch": 0.528344671201814, "grad_norm": 0.05709443241357803, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 53516454.0, "rewards/KL_reward/mean": -0.14998161792755127, "rewards/KL_reward/std": 0.086819127202034, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": -0.05606694519519806, "rewards/angle_reward/std": 0.6755439043045044, "rewards/thinking_verbosity_reward/mean": -0.7366902232170105, "rewards/thinking_verbosity_reward/std": 0.18104685842990875, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 264.4453125, "epoch": 0.5294784580498866, "grad_norm": 0.06272619962692261, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53581967.0, "rewards/KL_reward/mean": -0.14781680703163147, "rewards/KL_reward/std": 0.08237636089324951, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.061271876096725464, "rewards/angle_reward/std": 0.6824977993965149, "rewards/thinking_verbosity_reward/mean": -0.79157555103302, "rewards/thinking_verbosity_reward/std": 0.17959390580654144, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 274.828125, "epoch": 0.5306122448979592, "grad_norm": 0.07420215010643005, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 53649417.0, "rewards/KL_reward/mean": -0.16903798282146454, "rewards/KL_reward/std": 0.08875898271799088, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": -0.021459292620420456, "rewards/angle_reward/std": 0.7030213475227356, "rewards/thinking_verbosity_reward/mean": -0.8034688234329224, "rewards/thinking_verbosity_reward/std": 0.19828097522258759, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 237.3828125, "epoch": 0.5317460317460317, "grad_norm": 0.04708437994122505, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53711346.0, "rewards/KL_reward/mean": -0.17129938304424286, "rewards/KL_reward/std": 0.09669040143489838, "rewards/accuracy_reward/mean": 0.2265625, "rewards/accuracy_reward/std": 0.4202519655227661, "rewards/angle_reward/mean": -0.0005590301007032394, "rewards/angle_reward/std": 0.702396035194397, "rewards/thinking_verbosity_reward/mean": -0.7507597208023071, "rewards/thinking_verbosity_reward/std": 0.1658087521791458, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 206.328125, "epoch": 0.5328798185941043, "grad_norm": 0.07587230950593948, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53770020.0, "rewards/KL_reward/mean": -0.21926406025886536, "rewards/KL_reward/std": 0.10634764283895493, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.0663021057844162, "rewards/angle_reward/std": 0.7422232031822205, "rewards/thinking_verbosity_reward/mean": -0.7035611867904663, "rewards/thinking_verbosity_reward/std": 0.1358904093503952, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 214.9140625, "epoch": 0.5340136054421769, "grad_norm": 0.07078830152750015, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53829705.0, "rewards/KL_reward/mean": -0.24120840430259705, "rewards/KL_reward/std": 0.17651721835136414, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": -0.03783184662461281, "rewards/angle_reward/std": 0.6946678757667542, "rewards/thinking_verbosity_reward/mean": -0.7069072723388672, "rewards/thinking_verbosity_reward/std": 0.18784119188785553, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 216.9140625, "epoch": 0.5351473922902494, "grad_norm": 0.09605925530195236, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 53889510.0, "rewards/KL_reward/mean": -0.25455862283706665, "rewards/KL_reward/std": 0.2185129076242447, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/angle_reward/mean": -0.03943023830652237, "rewards/angle_reward/std": 0.7367250323295593, "rewards/thinking_verbosity_reward/mean": -0.7035454511642456, "rewards/thinking_verbosity_reward/std": 0.21246668696403503, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 171.3125, "epoch": 0.536281179138322, "grad_norm": 0.09049979597330093, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 53943630.0, "rewards/KL_reward/mean": -0.277265727519989, "rewards/KL_reward/std": 0.13049979507923126, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": -0.06564237922430038, "rewards/angle_reward/std": 0.7101609110832214, "rewards/thinking_verbosity_reward/mean": -0.6378944516181946, "rewards/thinking_verbosity_reward/std": 0.13784360885620117, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 148.8125, "epoch": 0.5374149659863946, "grad_norm": 0.11495912820100784, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 53994934.0, "rewards/KL_reward/mean": -0.34224867820739746, "rewards/KL_reward/std": 0.19699767231941223, "rewards/accuracy_reward/mean": 0.0390625, "rewards/accuracy_reward/std": 0.194504976272583, "rewards/angle_reward/mean": -0.008879505097866058, "rewards/angle_reward/std": 0.6997842788696289, "rewards/thinking_verbosity_reward/mean": -0.5875452160835266, "rewards/thinking_verbosity_reward/std": 0.15656743943691254, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 129.625, "epoch": 0.5385487528344671, "grad_norm": 0.1606689691543579, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54043854.0, "rewards/KL_reward/mean": -0.4127693474292755, "rewards/KL_reward/std": 0.2230147123336792, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.1325501799583435, "rewards/angle_reward/std": 0.7190689444541931, "rewards/thinking_verbosity_reward/mean": -0.547183632850647, "rewards/thinking_verbosity_reward/std": 0.14929042756557465, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 140.9296875, "epoch": 0.5396825396825397, "grad_norm": 0.13437631726264954, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54094013.0, "rewards/KL_reward/mean": -0.4838368892669678, "rewards/KL_reward/std": 0.4108221232891083, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.0751531645655632, "rewards/angle_reward/std": 0.7036384344100952, "rewards/thinking_verbosity_reward/mean": -0.5652834177017212, "rewards/thinking_verbosity_reward/std": 0.17469199001789093, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 114.9453125, "epoch": 0.5408163265306123, "grad_norm": 0.22570781409740448, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54140718.0, "rewards/KL_reward/mean": -0.5766152143478394, "rewards/KL_reward/std": 0.5207230448722839, "rewards/accuracy_reward/mean": 0.0546875, "rewards/accuracy_reward/std": 0.22826264798641205, "rewards/angle_reward/mean": -0.05803005397319794, "rewards/angle_reward/std": 0.7059148550033569, "rewards/thinking_verbosity_reward/mean": -0.5080236196517944, "rewards/thinking_verbosity_reward/std": 0.16427448391914368, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 90.3671875, "epoch": 0.5419501133786848, "grad_norm": 0.5434188842773438, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54184253.0, "rewards/KL_reward/mean": -0.7356384992599487, "rewards/KL_reward/std": 0.6622855067253113, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": -0.10792765021324158, "rewards/angle_reward/std": 0.7056306004524231, "rewards/thinking_verbosity_reward/mean": -0.4460752606391907, "rewards/thinking_verbosity_reward/std": 0.15693074464797974, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 83.7890625, "epoch": 0.5430839002267573, "grad_norm": 0.6073722243309021, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54227074.0, "rewards/KL_reward/mean": -0.9975966215133667, "rewards/KL_reward/std": 1.713445782661438, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.06350193172693253, "rewards/angle_reward/std": 0.7050646543502808, "rewards/thinking_verbosity_reward/mean": -0.4233494699001312, "rewards/thinking_verbosity_reward/std": 0.167231485247612, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 37.3046875, "epoch": 0.54421768707483, "grad_norm": 1.9195548295974731, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54264161.0, "rewards/KL_reward/mean": -1.834166169166565, "rewards/KL_reward/std": 1.3385273218154907, "rewards/accuracy_reward/mean": 0.078125, "rewards/accuracy_reward/std": 0.2694226801395416, "rewards/angle_reward/mean": 0.05574992671608925, "rewards/angle_reward/std": 0.711519181728363, "rewards/thinking_verbosity_reward/mean": -0.2586079239845276, "rewards/thinking_verbosity_reward/std": 0.15521405637264252, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 22.3984375, "epoch": 0.5453514739229025, "grad_norm": 0.7810635566711426, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54299196.0, "rewards/KL_reward/mean": -2.171604871749878, "rewards/KL_reward/std": 1.0320184230804443, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12450689822435379, "rewards/angle_reward/mean": 0.13008692860603333, "rewards/angle_reward/std": 0.6445255875587463, "rewards/thinking_verbosity_reward/mean": -0.1941802203655243, "rewards/thinking_verbosity_reward/std": 0.12615256011486053, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 8.65625, "epoch": 0.546485260770975, "grad_norm": 1.0425291061401367, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54331920.0, "rewards/KL_reward/mean": -2.89699125289917, "rewards/KL_reward/std": 1.0191465616226196, "rewards/accuracy_reward/mean": 0.1328125, "rewards/accuracy_reward/std": 0.3407054841518402, "rewards/angle_reward/mean": 0.11439374834299088, "rewards/angle_reward/std": 0.6321514844894409, "rewards/thinking_verbosity_reward/mean": -0.13150997459888458, "rewards/thinking_verbosity_reward/std": 0.04313100129365921, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 8.6328125, "epoch": 0.5476190476190477, "grad_norm": 0.6215812563896179, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54365409.0, "rewards/KL_reward/mean": -3.594916582107544, "rewards/KL_reward/std": 1.5705430507659912, "rewards/accuracy_reward/mean": 0.03125, "rewards/accuracy_reward/std": 0.1746762990951538, "rewards/angle_reward/mean": -0.04716882482171059, "rewards/angle_reward/std": 0.5511569380760193, "rewards/thinking_verbosity_reward/mean": -0.12736451625823975, "rewards/thinking_verbosity_reward/std": 0.05369199439883232, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.5703125, "epoch": 0.5487528344671202, "grad_norm": 0.5058205127716064, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54398402.0, "rewards/KL_reward/mean": -3.5983033180236816, "rewards/KL_reward/std": 0.9799101948738098, "rewards/accuracy_reward/mean": 0.046875, "rewards/accuracy_reward/std": 0.21220162510871887, "rewards/angle_reward/mean": -0.014187419787049294, "rewards/angle_reward/std": 0.6621562242507935, "rewards/thinking_verbosity_reward/mean": -0.1267462968826294, "rewards/thinking_verbosity_reward/std": 0.019078785553574562, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.640625, "epoch": 0.5498866213151927, "grad_norm": 0.5460374355316162, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54431804.0, "rewards/KL_reward/mean": -3.4369935989379883, "rewards/KL_reward/std": 1.0887336730957031, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": 0.017852269113063812, "rewards/angle_reward/std": 0.6593750715255737, "rewards/thinking_verbosity_reward/mean": -0.1271992027759552, "rewards/thinking_verbosity_reward/std": 0.020621543750166893, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.5390625, "epoch": 0.5510204081632653, "grad_norm": 0.5307416915893555, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54464305.0, "rewards/KL_reward/mean": -3.6182875633239746, "rewards/KL_reward/std": 1.1485244035720825, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/angle_reward/mean": 0.022097047418355942, "rewards/angle_reward/std": 0.569690465927124, "rewards/thinking_verbosity_reward/mean": -0.12511225044727325, "rewards/thinking_verbosity_reward/std": 0.026458468288183212, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.4140625, "epoch": 0.5521541950113379, "grad_norm": 0.857745349407196, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54497494.0, "rewards/KL_reward/mean": -3.4195375442504883, "rewards/KL_reward/std": 0.8222663998603821, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12450689822435379, "rewards/angle_reward/mean": -0.07733979821205139, "rewards/angle_reward/std": 0.6351537108421326, "rewards/thinking_verbosity_reward/mean": -0.12590113282203674, "rewards/thinking_verbosity_reward/std": 0.014324675314128399, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.125, "epoch": 0.5532879818594104, "grad_norm": 0.7705725431442261, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54530838.0, "rewards/KL_reward/mean": -3.646390438079834, "rewards/KL_reward/std": 1.0909744501113892, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.012675169855356216, "rewards/angle_reward/std": 0.7345511317253113, "rewards/thinking_verbosity_reward/mean": -0.12285362184047699, "rewards/thinking_verbosity_reward/std": 0.01487329788506031, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 6.453125, "epoch": 0.5544217687074829, "grad_norm": 1.0522595643997192, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54562448.0, "rewards/KL_reward/mean": -3.8998897075653076, "rewards/KL_reward/std": 1.3035476207733154, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.322907418012619, "rewards/angle_reward/mean": -0.05524272099137306, "rewards/angle_reward/std": 0.5859870910644531, "rewards/thinking_verbosity_reward/mean": -0.11607244610786438, "rewards/thinking_verbosity_reward/std": 0.012698753736913204, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.515625, "epoch": 0.5555555555555556, "grad_norm": 0.7135828137397766, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54594826.0, "rewards/KL_reward/mean": -3.6429526805877686, "rewards/KL_reward/std": 2.472461462020874, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12450689822435379, "rewards/angle_reward/mean": 0.021636418998241425, "rewards/angle_reward/std": 0.6236943602561951, "rewards/thinking_verbosity_reward/mean": -0.1257990598678589, "rewards/thinking_verbosity_reward/std": 0.021617397665977478, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 6.9453125, "epoch": 0.5566893424036281, "grad_norm": 0.8383898735046387, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54628283.0, "rewards/KL_reward/mean": -3.845817804336548, "rewards/KL_reward/std": 0.9958040118217468, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12450689822435379, "rewards/angle_reward/mean": -0.1233597844839096, "rewards/angle_reward/std": 0.6113914251327515, "rewards/thinking_verbosity_reward/mean": -0.12134502083063126, "rewards/thinking_verbosity_reward/std": 0.011821961961686611, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 7.03125, "epoch": 0.5578231292517006, "grad_norm": 1.1921794414520264, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 54661407.0, "rewards/KL_reward/mean": -3.6719002723693848, "rewards/KL_reward/std": 1.2782294750213623, "rewards/accuracy_reward/mean": 0.0703125, "rewards/accuracy_reward/std": 0.2566775679588318, "rewards/angle_reward/mean": -0.06361042708158493, "rewards/angle_reward/std": 0.6366872191429138, "rewards/thinking_verbosity_reward/mean": -0.1219245195388794, "rewards/thinking_verbosity_reward/std": 0.0146359046921134, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 6.4375, "epoch": 0.5589569160997733, "grad_norm": 1.3316749334335327, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54694175.0, "rewards/KL_reward/mean": -4.398461818695068, "rewards/KL_reward/std": 1.5992577075958252, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.007545974105596542, "rewards/angle_reward/std": 0.6428928971290588, "rewards/thinking_verbosity_reward/mean": -0.11585833877325058, "rewards/thinking_verbosity_reward/std": 0.013112529180943966, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 5.6796875, "epoch": 0.5600907029478458, "grad_norm": 2.5037853717803955, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54726870.0, "rewards/KL_reward/mean": -5.423948764801025, "rewards/KL_reward/std": 2.345402240753174, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24301259219646454, "rewards/angle_reward/mean": -0.11048542708158493, "rewards/angle_reward/std": 0.6301949620246887, "rewards/thinking_verbosity_reward/mean": -0.10674476623535156, "rewards/thinking_verbosity_reward/std": 0.017526322975754738, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 5.3359375, "epoch": 0.5612244897959183, "grad_norm": 2.2523908615112305, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 54759745.0, "rewards/KL_reward/mean": -6.66871452331543, "rewards/KL_reward/std": 2.817564010620117, "rewards/accuracy_reward/mean": 0.0859375, "rewards/accuracy_reward/std": 0.2813730239868164, "rewards/angle_reward/mean": 0.011048581451177597, "rewards/angle_reward/std": 0.6146785616874695, "rewards/thinking_verbosity_reward/mean": -0.10279585421085358, "rewards/thinking_verbosity_reward/std": 0.016583282500505447, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.1796875, "epoch": 0.562358276643991, "grad_norm": 0.9493520855903625, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 54791880.0, "rewards/KL_reward/mean": -8.693005561828613, "rewards/KL_reward/std": 1.4315944910049438, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.3486475646495819, "rewards/thinking_verbosity_reward/mean": -0.08882145583629608, "rewards/thinking_verbosity_reward/std": 0.00777425104752183, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0859375, "epoch": 0.5634920634920635, "grad_norm": 0.9398501515388489, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 54824803.0, "rewards/KL_reward/mean": -8.986701011657715, "rewards/KL_reward/std": 1.7665596008300781, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.07733979821205139, "rewards/angle_reward/std": 0.3463462293148041, "rewards/thinking_verbosity_reward/mean": -0.08729038387537003, "rewards/thinking_verbosity_reward/std": 0.009797018021345139, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.564625850340136, "grad_norm": 0.1411595642566681, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 54857107.0, "rewards/KL_reward/mean": -9.378817558288574, "rewards/KL_reward/std": 1.6368354558944702, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5657596371882087, "grad_norm": 0.12384044378995895, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 54890211.0, "rewards/KL_reward/mean": -10.19272232055664, "rewards/KL_reward/std": 1.4332489967346191, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0390625, "epoch": 0.5668934240362812, "grad_norm": 0.21007029712200165, "learning_rate": 5e-05, "loss": 0.0034, "num_tokens": 54922392.0, "rewards/KL_reward/mean": -9.341675758361816, "rewards/KL_reward/std": 1.3299744129180908, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.08688278496265411, "rewards/thinking_verbosity_reward/std": 0.003170661861076951, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5680272108843537, "grad_norm": 0.08080130070447922, "learning_rate": 5e-05, "loss": -0.0014, "num_tokens": 54954752.0, "rewards/KL_reward/mean": -9.528559684753418, "rewards/KL_reward/std": 1.6406162977218628, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5691609977324263, "grad_norm": 0.14609335362911224, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 54987352.0, "rewards/KL_reward/mean": -9.704826354980469, "rewards/KL_reward/std": 1.48715078830719, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0078125, "epoch": 0.5702947845804989, "grad_norm": 0.20738476514816284, "learning_rate": 5e-05, "loss": -0.0043, "num_tokens": 55020329.0, "rewards/KL_reward/mean": -9.367766380310059, "rewards/KL_reward/std": 0.917743444442749, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.086707204580307, "rewards/thinking_verbosity_reward/std": 0.0011841795640066266, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5714285714285714, "grad_norm": 0.0858779102563858, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 55052305.0, "rewards/KL_reward/mean": -9.519469261169434, "rewards/KL_reward/std": 1.7006648778915405, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.9921875, "epoch": 0.572562358276644, "grad_norm": 0.564150333404541, "learning_rate": 5e-05, "loss": 0.0076, "num_tokens": 55084392.0, "rewards/KL_reward/mean": -9.431741714477539, "rewards/KL_reward/std": 1.0106531381607056, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.08647838234901428, "rewards/thinking_verbosity_reward/std": 0.0014046551659703255, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5736961451247166, "grad_norm": 0.06539922207593918, "learning_rate": 5e-05, "loss": -0.0044, "num_tokens": 55117504.0, "rewards/KL_reward/mean": -9.063766479492188, "rewards/KL_reward/std": 1.2047866582870483, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5748299319727891, "grad_norm": 0.15550510585308075, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 55150368.0, "rewards/KL_reward/mean": -9.597511291503906, "rewards/KL_reward/std": 1.5458221435546875, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5759637188208617, "grad_norm": 0.10294591635465622, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 55182768.0, "rewards/KL_reward/mean": -9.482969284057617, "rewards/KL_reward/std": 1.1649845838546753, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5770975056689343, "grad_norm": 0.07005158066749573, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 55215344.0, "rewards/KL_reward/mean": -9.859160423278809, "rewards/KL_reward/std": 1.0055972337722778, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5782312925170068, "grad_norm": 0.11765587329864502, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 55248264.0, "rewards/KL_reward/mean": -9.879338264465332, "rewards/KL_reward/std": 1.404909372329712, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5793650793650794, "grad_norm": 0.11884509772062302, "learning_rate": 5e-05, "loss": 0.0025, "num_tokens": 55281080.0, "rewards/KL_reward/mean": -9.526758193969727, "rewards/KL_reward/std": 1.5549665689468384, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0234375, "epoch": 0.5804988662131519, "grad_norm": 0.2688693106174469, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 55313387.0, "rewards/KL_reward/mean": -9.45893383026123, "rewards/KL_reward/std": 1.2606397867202759, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.022097086533904076, "rewards/angle_reward/std": 0.17607934772968292, "rewards/thinking_verbosity_reward/mean": -0.0868721455335617, "rewards/thinking_verbosity_reward/std": 0.0034520491026341915, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5816326530612245, "grad_norm": 0.06777095049619675, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 55345827.0, "rewards/KL_reward/mean": -9.774582862854004, "rewards/KL_reward/std": 1.4200104475021362, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5827664399092971, "grad_norm": 0.08141104876995087, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 55378259.0, "rewards/KL_reward/mean": -9.087238311767578, "rewards/KL_reward/std": 1.020469069480896, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.015625, "epoch": 0.5839002267573696, "grad_norm": 0.3209582567214966, "learning_rate": 5e-05, "loss": 0.0022, "num_tokens": 55410893.0, "rewards/KL_reward/mean": -9.666126251220703, "rewards/KL_reward/std": 1.3984832763671875, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.08679942041635513, "rewards/thinking_verbosity_reward/std": 0.002227462362498045, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5850340136054422, "grad_norm": 0.11173339933156967, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 55443661.0, "rewards/KL_reward/mean": -9.571331024169922, "rewards/KL_reward/std": 0.9361988306045532, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5861678004535147, "grad_norm": 0.15266042947769165, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 55476573.0, "rewards/KL_reward/mean": -9.854283332824707, "rewards/KL_reward/std": 1.109019160270691, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5873015873015873, "grad_norm": 0.2016039490699768, "learning_rate": 5e-05, "loss": -0.0027, "num_tokens": 55509413.0, "rewards/KL_reward/mean": -9.629522323608398, "rewards/KL_reward/std": 1.0815303325653076, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5884353741496599, "grad_norm": 0.07484356313943863, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 55541789.0, "rewards/KL_reward/mean": -8.9932861328125, "rewards/KL_reward/std": 0.9899211525917053, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.9921875, "epoch": 0.5895691609977324, "grad_norm": 0.37136051058769226, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 55574068.0, "rewards/KL_reward/mean": -9.123318672180176, "rewards/KL_reward/std": 1.024639368057251, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.08647838234901428, "rewards/thinking_verbosity_reward/std": 0.0014046551659703255, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.590702947845805, "grad_norm": 0.0857769325375557, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 55606356.0, "rewards/KL_reward/mean": -9.66751766204834, "rewards/KL_reward/std": 1.013816237449646, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5918367346938775, "grad_norm": 0.0768849328160286, "learning_rate": 5e-05, "loss": 0.0022, "num_tokens": 55638556.0, "rewards/KL_reward/mean": -9.529890060424805, "rewards/KL_reward/std": 0.9348580241203308, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5929705215419501, "grad_norm": 0.16719675064086914, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 55671244.0, "rewards/KL_reward/mean": -9.763287544250488, "rewards/KL_reward/std": 1.6031088829040527, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.9921875, "epoch": 0.5941043083900227, "grad_norm": 0.39724478125572205, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 55702843.0, "rewards/KL_reward/mean": -9.557028770446777, "rewards/KL_reward/std": 1.890602946281433, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.08647838234901428, "rewards/thinking_verbosity_reward/std": 0.0014046551659703255, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5952380952380952, "grad_norm": 0.1424868106842041, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 55735427.0, "rewards/KL_reward/mean": -9.618017196655273, "rewards/KL_reward/std": 1.1586666107177734, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5963718820861678, "grad_norm": 0.1281495839357376, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 55767619.0, "rewards/KL_reward/mean": -9.235986709594727, "rewards/KL_reward/std": 1.3936333656311035, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.9765625, "epoch": 0.5975056689342404, "grad_norm": 0.827623188495636, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 55799872.0, "rewards/KL_reward/mean": -9.611066818237305, "rewards/KL_reward/std": 1.5234673023223877, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.08619242906570435, "rewards/thinking_verbosity_reward/std": 0.003516852855682373, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0234375, "epoch": 0.5986394557823129, "grad_norm": 0.4614194631576538, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 55832363.0, "rewards/KL_reward/mean": -9.693451881408691, "rewards/KL_reward/std": 1.0000463724136353, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.08688278496265411, "rewards/thinking_verbosity_reward/std": 0.003170661861076951, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.5997732426303855, "grad_norm": 0.08846230804920197, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 55864427.0, "rewards/KL_reward/mean": -9.651050567626953, "rewards/KL_reward/std": 1.1203112602233887, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, "rewards/thinking_verbosity_reward/std": 0.0, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.984375, "epoch": 0.6009070294784581, "grad_norm": 0.8824671506881714, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 55897281.0, "rewards/KL_reward/mean": -9.703022956848145, "rewards/KL_reward/std": 1.6452223062515259, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.08635422587394714, "rewards/thinking_verbosity_reward/std": 0.001978646032512188, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.9609375, "epoch": 0.6020408163265306, "grad_norm": 0.7833751440048218, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 55929860.0, "rewards/KL_reward/mean": -10.333253860473633, "rewards/KL_reward/std": 1.588026762008667, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.13258251547813416, "rewards/angle_reward/std": 0.3290405869483948, "rewards/thinking_verbosity_reward/mean": -0.08594411611557007, "rewards/thinking_verbosity_reward/std": 0.0040097408927977085, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.9921875, "epoch": 0.6031746031746031, "grad_norm": 1.585099220275879, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 55962315.0, "rewards/KL_reward/mean": -9.991573333740234, "rewards/KL_reward/std": 1.682023048400879, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.46836432814598083, "rewards/thinking_verbosity_reward/mean": -0.08640043437480927, "rewards/thinking_verbosity_reward/std": 0.003943993244320154, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.828125, "epoch": 0.6043083900226758, "grad_norm": 2.4811768531799316, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 55995021.0, "rewards/KL_reward/mean": -9.870138168334961, "rewards/KL_reward/std": 2.0301644802093506, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.3425048589706421, "rewards/angle_reward/std": 0.5096268057823181, "rewards/thinking_verbosity_reward/mean": -0.08377633988857269, "rewards/thinking_verbosity_reward/std": 0.00722804618999362, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 29.84375, "epoch": 0.6054421768707483, "grad_norm": 2.358689546585083, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56031185.0, "rewards/KL_reward/mean": -11.116844177246094, "rewards/KL_reward/std": 4.051690578460693, "rewards/accuracy_reward/mean": 0.015625, "rewards/accuracy_reward/std": 0.12450689822435379, "rewards/angle_reward/mean": -0.005799375474452972, "rewards/angle_reward/std": 0.6632209420204163, "rewards/thinking_verbosity_reward/mean": -0.10786447674036026, "rewards/thinking_verbosity_reward/std": 0.24688217043876648, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.40625, "epoch": 0.6065759637188208, "grad_norm": 3.3132803440093994, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56063637.0, "rewards/KL_reward/mean": -12.70104694366455, "rewards/KL_reward/std": 3.0239720344543457, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/angle_reward/mean": 0.03127017617225647, "rewards/angle_reward/std": 0.7112759947776794, "rewards/thinking_verbosity_reward/mean": -0.07684475183486938, "rewards/thinking_verbosity_reward/std": 0.01144831720739603, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.59375, "epoch": 0.6077097505668935, "grad_norm": 1.9887434244155884, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56095833.0, "rewards/KL_reward/mean": -13.586993217468262, "rewards/KL_reward/std": 3.1474719047546387, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.14363107085227966, "rewards/angle_reward/std": 0.5976290106773376, "rewards/thinking_verbosity_reward/mean": -0.07788439095020294, "rewards/thinking_verbosity_reward/std": 0.020535118877887726, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.1328125, "epoch": 0.608843537414966, "grad_norm": 1.1461775302886963, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 56127986.0, "rewards/KL_reward/mean": -15.27037239074707, "rewards/KL_reward/std": 2.396375894546509, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.22097085416316986, "rewards/angle_reward/std": 0.4138355255126953, "rewards/thinking_verbosity_reward/mean": -0.07201752811670303, "rewards/thinking_verbosity_reward/std": 0.012897231616079807, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.0234375, "epoch": 0.6099773242630385, "grad_norm": 2.3434665203094482, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56160565.0, "rewards/KL_reward/mean": -14.925134658813477, "rewards/KL_reward/std": 1.6955004930496216, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/angle_reward/mean": 0.006519727408885956, "rewards/angle_reward/std": 0.4306424856185913, "rewards/thinking_verbosity_reward/mean": -0.0707487016916275, "rewards/thinking_verbosity_reward/std": 0.00732355285435915, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 26.5390625, "epoch": 0.6111111111111112, "grad_norm": 4.019361972808838, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 56195946.0, "rewards/KL_reward/mean": -14.851444244384766, "rewards/KL_reward/std": 2.7126476764678955, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.022097084671258926, "rewards/angle_reward/std": 0.39621734619140625, "rewards/thinking_verbosity_reward/mean": -0.08837562799453735, "rewards/thinking_verbosity_reward/std": 0.23769377171993256, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 8.140625, "epoch": 0.6122448979591837, "grad_norm": 6.1977362632751465, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56228708.0, "rewards/KL_reward/mean": -16.46318817138672, "rewards/KL_reward/std": 3.266050100326538, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0883883386850357, "rewards/angle_reward/std": 0.5541539788246155, "rewards/thinking_verbosity_reward/mean": -0.0723920613527298, "rewards/thinking_verbosity_reward/std": 0.11273974925279617, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 51.421875, "epoch": 0.6133786848072562, "grad_norm": 6.258033752441406, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56266698.0, "rewards/KL_reward/mean": -14.750425338745117, "rewards/KL_reward/std": 4.972354888916016, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.09943688660860062, "rewards/angle_reward/std": 0.6066194772720337, "rewards/thinking_verbosity_reward/mean": -0.11021846532821655, "rewards/thinking_verbosity_reward/std": 0.33888471126556396, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.9609375, "epoch": 0.6145124716553289, "grad_norm": 2.545353889465332, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56299501.0, "rewards/KL_reward/mean": -13.780776023864746, "rewards/KL_reward/std": 4.725129127502441, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.15467959642410278, "rewards/angle_reward/std": 0.5092645287513733, "rewards/thinking_verbosity_reward/mean": -0.06911271065473557, "rewards/thinking_verbosity_reward/std": 0.07187584787607193, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 45.65625, "epoch": 0.6156462585034014, "grad_norm": 8.453726768493652, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 56337401.0, "rewards/KL_reward/mean": -15.310544967651367, "rewards/KL_reward/std": 4.958581447601318, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.055242717266082764, "rewards/angle_reward/std": 0.5859870910644531, "rewards/thinking_verbosity_reward/mean": -0.11464644968509674, "rewards/thinking_verbosity_reward/std": 0.31510642170906067, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 2.7890625, "epoch": 0.6167800453514739, "grad_norm": 4.095304489135742, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56370014.0, "rewards/KL_reward/mean": -19.383827209472656, "rewards/KL_reward/std": 6.920942783355713, "rewards/accuracy_reward/mean": 0.0234375, "rewards/accuracy_reward/std": 0.15188287198543549, "rewards/angle_reward/mean": 0.09018324315547943, "rewards/angle_reward/std": 0.4330177307128906, "rewards/thinking_verbosity_reward/mean": -0.0450855977833271, "rewards/thinking_verbosity_reward/std": 0.049589890986680984, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.6171875, "epoch": 0.6179138321995464, "grad_norm": 18.613027572631836, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 56402205.0, "rewards/KL_reward/mean": -22.613176345825195, "rewards/KL_reward/std": 8.32883071899414, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.07733979821205139, "rewards/angle_reward/std": 0.46308088302612305, "rewards/thinking_verbosity_reward/mean": -0.024793803691864014, "rewards/thinking_verbosity_reward/std": 0.03058668039739132, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.2109375, "epoch": 0.6190476190476191, "grad_norm": 6.894687652587891, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 56433896.0, "rewards/KL_reward/mean": -26.91053009033203, "rewards/KL_reward/std": 7.242550849914551, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.055242717266082764, "rewards/angle_reward/std": 0.3505830466747284, "rewards/thinking_verbosity_reward/mean": -0.010546875186264515, "rewards/thinking_verbosity_reward/std": 0.020478859543800354, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.046875, "epoch": 0.6201814058956916, "grad_norm": 5.023636341094971, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 56464654.0, "rewards/KL_reward/mean": -32.346656799316406, "rewards/KL_reward/std": 3.809330940246582, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.0013531646691262722, "rewards/thinking_verbosity_reward/std": 0.010782613418996334, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6213151927437641, "grad_norm": 0.4634699821472168, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 56496894.0, "rewards/KL_reward/mean": -33.13923645019531, "rewards/KL_reward/std": 1.1342289447784424, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6224489795918368, "grad_norm": 0.7207654714584351, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 56528990.0, "rewards/KL_reward/mean": -33.207855224609375, "rewards/KL_reward/std": 1.6682014465332031, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.6235827664399093, "grad_norm": 1.3963048458099365, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 56560783.0, "rewards/KL_reward/mean": -33.020751953125, "rewards/KL_reward/std": 1.8156154155731201, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6247165532879818, "grad_norm": 0.6619357466697693, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 56592831.0, "rewards/KL_reward/mean": -32.45671463012695, "rewards/KL_reward/std": 4.836127758026123, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6258503401360545, "grad_norm": 0.4096638858318329, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 56624415.0, "rewards/KL_reward/mean": -33.64544677734375, "rewards/KL_reward/std": 0.7837253212928772, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.626984126984127, "grad_norm": 0.035003989934921265, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 56656871.0, "rewards/KL_reward/mean": -33.58020782470703, "rewards/KL_reward/std": 1.1956831216812134, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6281179138321995, "grad_norm": 0.002811228157952428, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 56688807.0, "rewards/KL_reward/mean": -33.92033386230469, "rewards/KL_reward/std": 0.7668801546096802, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6292517006802721, "grad_norm": 0.0012724545085802674, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 56720983.0, "rewards/KL_reward/mean": -33.561241149902344, "rewards/KL_reward/std": 0.7718645334243774, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6303854875283447, "grad_norm": 0.002310891402885318, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 56753151.0, "rewards/KL_reward/mean": -33.38077163696289, "rewards/KL_reward/std": 1.3701101541519165, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6315192743764172, "grad_norm": 0.23985038697719574, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 56785231.0, "rewards/KL_reward/mean": -33.302833557128906, "rewards/KL_reward/std": 1.7017228603363037, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6326530612244898, "grad_norm": 0.00048049696488305926, "learning_rate": 5e-05, "loss": 0.0021, "num_tokens": 56817119.0, "rewards/KL_reward/mean": -33.43657684326172, "rewards/KL_reward/std": 0.8716328144073486, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6337868480725624, "grad_norm": 0.6322202086448669, "learning_rate": 5e-05, "loss": -0.0021, "num_tokens": 56848839.0, "rewards/KL_reward/mean": -32.780845642089844, "rewards/KL_reward/std": 2.6573376655578613, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6349206349206349, "grad_norm": 0.5810360908508301, "learning_rate": 5e-05, "loss": 0.0009, "num_tokens": 56881199.0, "rewards/KL_reward/mean": -32.47676086425781, "rewards/KL_reward/std": 2.3711776733398438, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6360544217687075, "grad_norm": 0.40197378396987915, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 56913071.0, "rewards/KL_reward/mean": -32.77272415161133, "rewards/KL_reward/std": 3.1013097763061523, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.63718820861678, "grad_norm": 0.0005623187753371894, "learning_rate": 5e-05, "loss": 0.0033, "num_tokens": 56945607.0, "rewards/KL_reward/mean": -33.655338287353516, "rewards/KL_reward/std": 1.1464321613311768, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6383219954648526, "grad_norm": 0.0015330812893807888, "learning_rate": 5e-05, "loss": 0.0019, "num_tokens": 56977527.0, "rewards/KL_reward/mean": -32.998355865478516, "rewards/KL_reward/std": 1.5066907405853271, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6394557823129252, "grad_norm": 0.33337169885635376, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 57009295.0, "rewards/KL_reward/mean": -33.21354675292969, "rewards/KL_reward/std": 1.2528866529464722, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6405895691609977, "grad_norm": 0.0013399942545220256, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 57041639.0, "rewards/KL_reward/mean": -33.86585998535156, "rewards/KL_reward/std": 1.0116124153137207, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6417233560090703, "grad_norm": 0.05720170587301254, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 57074119.0, "rewards/KL_reward/mean": -32.12410354614258, "rewards/KL_reward/std": 1.4921815395355225, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6428571428571429, "grad_norm": 0.0004908728296868503, "learning_rate": 5e-05, "loss": -0.0025, "num_tokens": 57106527.0, "rewards/KL_reward/mean": -33.23358154296875, "rewards/KL_reward/std": 0.9941945672035217, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6439909297052154, "grad_norm": 0.0002522620197851211, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 57138207.0, "rewards/KL_reward/mean": -33.54637145996094, "rewards/KL_reward/std": 1.3176603317260742, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.645124716553288, "grad_norm": 0.0027960508596152067, "learning_rate": 5e-05, "loss": -0.0043, "num_tokens": 57170455.0, "rewards/KL_reward/mean": -33.99753952026367, "rewards/KL_reward/std": 1.4594237804412842, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6462585034013606, "grad_norm": 0.0008815639885142446, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 57202839.0, "rewards/KL_reward/mean": -33.43678665161133, "rewards/KL_reward/std": 1.100521445274353, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6473922902494331, "grad_norm": 0.005347798112779856, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 57235287.0, "rewards/KL_reward/mean": -32.888587951660156, "rewards/KL_reward/std": 2.2769651412963867, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6485260770975056, "grad_norm": 0.0006010847282595932, "learning_rate": 5e-05, "loss": -0.0047, "num_tokens": 57266391.0, "rewards/KL_reward/mean": -33.827232360839844, "rewards/KL_reward/std": 1.2292903661727905, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6496598639455783, "grad_norm": 0.0024182789493352175, "learning_rate": 5e-05, "loss": 0.0044, "num_tokens": 57298743.0, "rewards/KL_reward/mean": -33.92839431762695, "rewards/KL_reward/std": 1.1869778633117676, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6507936507936508, "grad_norm": 0.0004037956241518259, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 57330807.0, "rewards/KL_reward/mean": -33.319618225097656, "rewards/KL_reward/std": 1.1940268278121948, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6519274376417233, "grad_norm": 0.0005184471956454217, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 57363559.0, "rewards/KL_reward/mean": -33.24909973144531, "rewards/KL_reward/std": 0.9304895401000977, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6530612244897959, "grad_norm": 0.00027723200037144125, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 57394887.0, "rewards/KL_reward/mean": -33.85890197753906, "rewards/KL_reward/std": 1.0348025560379028, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6541950113378685, "grad_norm": 0.01639043353497982, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 57427231.0, "rewards/KL_reward/mean": -33.93389892578125, "rewards/KL_reward/std": 0.9749399423599243, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.655328798185941, "grad_norm": 0.1564824879169464, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 57459729.0, "rewards/KL_reward/mean": -33.030155181884766, "rewards/KL_reward/std": 1.307621717453003, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6564625850340136, "grad_norm": 0.00019246863666921854, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 57491377.0, "rewards/KL_reward/mean": -33.92146301269531, "rewards/KL_reward/std": 1.330231785774231, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6575963718820862, "grad_norm": 0.23386387526988983, "learning_rate": 5e-05, "loss": -0.0012, "num_tokens": 57523513.0, "rewards/KL_reward/mean": -33.39048767089844, "rewards/KL_reward/std": 2.2050530910491943, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6587301587301587, "grad_norm": 0.0003712023317348212, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 57555297.0, "rewards/KL_reward/mean": -33.397918701171875, "rewards/KL_reward/std": 0.8269470930099487, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6598639455782312, "grad_norm": 0.0018605771474540234, "learning_rate": 5e-05, "loss": 0.0047, "num_tokens": 57587193.0, "rewards/KL_reward/mean": -33.85803985595703, "rewards/KL_reward/std": 1.7370949983596802, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6609977324263039, "grad_norm": 0.0011312238639220595, "learning_rate": 5e-05, "loss": -0.0051, "num_tokens": 57619657.0, "rewards/KL_reward/mean": -33.73335266113281, "rewards/KL_reward/std": 1.1629188060760498, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6621315192743764, "grad_norm": 0.00016034345026127994, "learning_rate": 5e-05, "loss": -0.0008, "num_tokens": 57651625.0, "rewards/KL_reward/mean": -33.82770919799805, "rewards/KL_reward/std": 0.7721322178840637, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6632653061224489, "grad_norm": 0.00107968517113477, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 57684313.0, "rewards/KL_reward/mean": -33.06117248535156, "rewards/KL_reward/std": 0.8302332162857056, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6643990929705216, "grad_norm": 0.001078012166544795, "learning_rate": 5e-05, "loss": 0.0025, "num_tokens": 57716849.0, "rewards/KL_reward/mean": -33.73332214355469, "rewards/KL_reward/std": 0.9741858839988708, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6655328798185941, "grad_norm": 0.004448353312909603, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 57749345.0, "rewards/KL_reward/mean": -33.36539840698242, "rewards/KL_reward/std": 1.5083750486373901, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6666666666666666, "grad_norm": 0.5591500997543335, "learning_rate": 5e-05, "loss": -0.007, "num_tokens": 57781753.0, "rewards/KL_reward/mean": -32.74077606201172, "rewards/KL_reward/std": 1.90312659740448, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6678004535147393, "grad_norm": 0.00034644355764612556, "learning_rate": 5e-05, "loss": -0.0046, "num_tokens": 57813553.0, "rewards/KL_reward/mean": -33.96821594238281, "rewards/KL_reward/std": 0.9595673084259033, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6689342403628118, "grad_norm": 0.5171715617179871, "learning_rate": 5e-05, "loss": -0.0045, "num_tokens": 57845481.0, "rewards/KL_reward/mean": -32.79689407348633, "rewards/KL_reward/std": 3.483654737472534, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6700680272108843, "grad_norm": 0.09763412922620773, "learning_rate": 5e-05, "loss": -0.0057, "num_tokens": 57877929.0, "rewards/KL_reward/mean": -33.133235931396484, "rewards/KL_reward/std": 1.5220880508422852, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.671201814058957, "grad_norm": 0.0015285967383533716, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 57910425.0, "rewards/KL_reward/mean": -33.20171356201172, "rewards/KL_reward/std": 1.681617259979248, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6723356009070295, "grad_norm": 0.00030617736047133803, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 57942897.0, "rewards/KL_reward/mean": -33.10102462768555, "rewards/KL_reward/std": 1.0969640016555786, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.673469387755102, "grad_norm": 0.31258565187454224, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 57975081.0, "rewards/KL_reward/mean": -34.14094543457031, "rewards/KL_reward/std": 1.3639986515045166, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6746031746031746, "grad_norm": 0.0006680377409793437, "learning_rate": 5e-05, "loss": 0.002, "num_tokens": 58007609.0, "rewards/KL_reward/mean": -33.03845977783203, "rewards/KL_reward/std": 1.2797842025756836, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6757369614512472, "grad_norm": 0.0017703570192679763, "learning_rate": 5e-05, "loss": -0.0019, "num_tokens": 58039529.0, "rewards/KL_reward/mean": -33.55364990234375, "rewards/KL_reward/std": 0.9732753038406372, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6768707482993197, "grad_norm": 0.0001716313709039241, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 58070553.0, "rewards/KL_reward/mean": -33.6715087890625, "rewards/KL_reward/std": 0.7248656153678894, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6780045351473923, "grad_norm": 0.0002292812569066882, "learning_rate": 5e-05, "loss": 0.0042, "num_tokens": 58102009.0, "rewards/KL_reward/mean": -33.2886962890625, "rewards/KL_reward/std": 1.0974416732788086, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6791383219954649, "grad_norm": 0.0003618478949647397, "learning_rate": 5e-05, "loss": 0.0041, "num_tokens": 58134009.0, "rewards/KL_reward/mean": -34.09300231933594, "rewards/KL_reward/std": 0.9552999138832092, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6802721088435374, "grad_norm": 0.00029338515014387667, "learning_rate": 5e-05, "loss": 0.0029, "num_tokens": 58165673.0, "rewards/KL_reward/mean": -33.718257904052734, "rewards/KL_reward/std": 1.2162995338439941, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.68140589569161, "grad_norm": 0.002398415934294462, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 58196625.0, "rewards/KL_reward/mean": -33.8197135925293, "rewards/KL_reward/std": 1.4261441230773926, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6825396825396826, "grad_norm": 0.0002242231712443754, "learning_rate": 5e-05, "loss": 0.0032, "num_tokens": 58228697.0, "rewards/KL_reward/mean": -33.507415771484375, "rewards/KL_reward/std": 1.07442045211792, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6836734693877551, "grad_norm": 0.0006778687820769846, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 58260633.0, "rewards/KL_reward/mean": -33.163368225097656, "rewards/KL_reward/std": 0.9972267150878906, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0390625, "epoch": 0.6848072562358276, "grad_norm": 0.5446889400482178, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 58292790.0, "rewards/KL_reward/mean": -33.51383590698242, "rewards/KL_reward/std": 2.0158700942993164, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0006765823345631361, "rewards/thinking_verbosity_reward/std": 0.00765465572476387, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6859410430839002, "grad_norm": 0.00029776192968711257, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 58324862.0, "rewards/KL_reward/mean": -33.280799865722656, "rewards/KL_reward/std": 0.9887773990631104, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6870748299319728, "grad_norm": 0.0018429755000397563, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 58356718.0, "rewards/KL_reward/mean": -33.600765228271484, "rewards/KL_reward/std": 1.753105640411377, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6882086167800453, "grad_norm": 0.0004957416094839573, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 58388950.0, "rewards/KL_reward/mean": -33.10886001586914, "rewards/KL_reward/std": 0.9649978280067444, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6893424036281179, "grad_norm": 0.0045425486750900745, "learning_rate": 5e-05, "loss": 0.0032, "num_tokens": 58421038.0, "rewards/KL_reward/mean": -32.95924377441406, "rewards/KL_reward/std": 1.7486824989318848, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.6904761904761905, "grad_norm": 1.0173194408416748, "learning_rate": 5e-05, "loss": -0.0048, "num_tokens": 58453096.0, "rewards/KL_reward/mean": -33.599456787109375, "rewards/KL_reward/std": 1.6784018278121948, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.691609977324263, "grad_norm": 0.0007395737338811159, "learning_rate": 5e-05, "loss": 0.0055, "num_tokens": 58485088.0, "rewards/KL_reward/mean": -33.9056282043457, "rewards/KL_reward/std": 0.8960281014442444, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6927437641723356, "grad_norm": 0.0005152480443939567, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 58517096.0, "rewards/KL_reward/mean": -33.851016998291016, "rewards/KL_reward/std": 1.5645488500595093, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6938775510204082, "grad_norm": 0.00024765508715063334, "learning_rate": 5e-05, "loss": 0.0017, "num_tokens": 58549704.0, "rewards/KL_reward/mean": -33.67916488647461, "rewards/KL_reward/std": 1.1806282997131348, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6950113378684807, "grad_norm": 0.06450813263654709, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 58581488.0, "rewards/KL_reward/mean": -33.84139633178711, "rewards/KL_reward/std": 2.0141618251800537, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6961451247165533, "grad_norm": 0.9488609433174133, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 58613456.0, "rewards/KL_reward/mean": -32.28491973876953, "rewards/KL_reward/std": 2.4036858081817627, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6972789115646258, "grad_norm": 0.0016067641554400325, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 58645432.0, "rewards/KL_reward/mean": -33.97580337524414, "rewards/KL_reward/std": 1.25538170337677, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.6984126984126984, "grad_norm": 0.0003554086433723569, "learning_rate": 5e-05, "loss": -0.0027, "num_tokens": 58677704.0, "rewards/KL_reward/mean": -33.46821975708008, "rewards/KL_reward/std": 2.084940195083618, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.699546485260771, "grad_norm": 0.0007773024262860417, "learning_rate": 5e-05, "loss": 0.0036, "num_tokens": 58709696.0, "rewards/KL_reward/mean": -33.11626052856445, "rewards/KL_reward/std": 1.1806670427322388, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7006802721088435, "grad_norm": 0.07335913926362991, "learning_rate": 5e-05, "loss": 0.0045, "num_tokens": 58742096.0, "rewards/KL_reward/mean": -33.329498291015625, "rewards/KL_reward/std": 1.485001802444458, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7018140589569161, "grad_norm": 0.00014804053353145719, "learning_rate": 5e-05, "loss": 0.0061, "num_tokens": 58774032.0, "rewards/KL_reward/mean": -33.54657745361328, "rewards/KL_reward/std": 1.0319091081619263, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7029478458049887, "grad_norm": 0.00015110177628230304, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 58806544.0, "rewards/KL_reward/mean": -33.54651641845703, "rewards/KL_reward/std": 0.8972129225730896, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7040816326530612, "grad_norm": 0.00027474554372020066, "learning_rate": 5e-05, "loss": -0.0029, "num_tokens": 58838840.0, "rewards/KL_reward/mean": -33.46837615966797, "rewards/KL_reward/std": 0.8508355617523193, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7052154195011338, "grad_norm": 0.00023305356444325298, "learning_rate": 5e-05, "loss": 0.0024, "num_tokens": 58870776.0, "rewards/KL_reward/mean": -33.83546447753906, "rewards/KL_reward/std": 1.3228144645690918, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7063492063492064, "grad_norm": 6.870734796393663e-05, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 58901968.0, "rewards/KL_reward/mean": -33.718536376953125, "rewards/KL_reward/std": 0.7522987127304077, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7074829931972789, "grad_norm": 0.0712951123714447, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 58934136.0, "rewards/KL_reward/mean": -33.2022819519043, "rewards/KL_reward/std": 1.5445148944854736, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7086167800453514, "grad_norm": 0.5388066172599792, "learning_rate": 5e-05, "loss": 0.0049, "num_tokens": 58965536.0, "rewards/KL_reward/mean": -32.57041931152344, "rewards/KL_reward/std": 4.242175579071045, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7097505668934241, "grad_norm": 0.0003903552424162626, "learning_rate": 5e-05, "loss": 0.011, "num_tokens": 58998136.0, "rewards/KL_reward/mean": -33.62450408935547, "rewards/KL_reward/std": 1.1225509643554688, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7108843537414966, "grad_norm": 0.5541359782218933, "learning_rate": 5e-05, "loss": 0.0062, "num_tokens": 59030352.0, "rewards/KL_reward/mean": -32.1502799987793, "rewards/KL_reward/std": 3.336327314376831, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7120181405895691, "grad_norm": 0.001875443966127932, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 59062008.0, "rewards/KL_reward/mean": -33.85092544555664, "rewards/KL_reward/std": 1.4876642227172852, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7131519274376418, "grad_norm": 0.00022843039187137038, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 59094136.0, "rewards/KL_reward/mean": -33.3590087890625, "rewards/KL_reward/std": 1.20937979221344, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7142857142857143, "grad_norm": 0.00017198668501805514, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 59126424.0, "rewards/KL_reward/mean": -33.39804458618164, "rewards/KL_reward/std": 0.9631538987159729, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7154195011337868, "grad_norm": 0.058965396136045456, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 59158968.0, "rewards/KL_reward/mean": -33.300758361816406, "rewards/KL_reward/std": 1.4007177352905273, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7165532879818595, "grad_norm": 0.0004953857278451324, "learning_rate": 5e-05, "loss": 0.0075, "num_tokens": 59190792.0, "rewards/KL_reward/mean": -33.7652473449707, "rewards/KL_reward/std": 1.009839653968811, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.717687074829932, "grad_norm": 0.0001303389435634017, "learning_rate": 5e-05, "loss": 0.0039, "num_tokens": 59222240.0, "rewards/KL_reward/mean": -33.73405838012695, "rewards/KL_reward/std": 1.7354185581207275, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7188208616780045, "grad_norm": 0.3652340769767761, "learning_rate": 5e-05, "loss": 0.0045, "num_tokens": 59254552.0, "rewards/KL_reward/mean": -32.576416015625, "rewards/KL_reward/std": 2.7574236392974854, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.719954648526077, "grad_norm": 0.0008426376734860241, "learning_rate": 5e-05, "loss": 0.004, "num_tokens": 59287064.0, "rewards/KL_reward/mean": -33.13212203979492, "rewards/KL_reward/std": 1.009618878364563, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7210884353741497, "grad_norm": 0.00012571057595778257, "learning_rate": 5e-05, "loss": -0.0112, "num_tokens": 59318632.0, "rewards/KL_reward/mean": -33.55448913574219, "rewards/KL_reward/std": 0.9037834405899048, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7222222222222222, "grad_norm": 0.0003126139345113188, "learning_rate": 5e-05, "loss": -0.0126, "num_tokens": 59350408.0, "rewards/KL_reward/mean": -33.88240051269531, "rewards/KL_reward/std": 1.4632179737091064, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7233560090702947, "grad_norm": 0.0002778090420179069, "learning_rate": 5e-05, "loss": -0.0102, "num_tokens": 59382320.0, "rewards/KL_reward/mean": -33.33558654785156, "rewards/KL_reward/std": 1.2727230787277222, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7244897959183674, "grad_norm": 0.00049192103324458, "learning_rate": 5e-05, "loss": 0.0101, "num_tokens": 59414512.0, "rewards/KL_reward/mean": -33.272857666015625, "rewards/KL_reward/std": 1.0962445735931396, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7256235827664399, "grad_norm": 0.00018673698650673032, "learning_rate": 5e-05, "loss": 0.0058, "num_tokens": 59445912.0, "rewards/KL_reward/mean": -33.45282745361328, "rewards/KL_reward/std": 1.211887001991272, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7267573696145124, "grad_norm": 0.995429277420044, "learning_rate": 5e-05, "loss": 0.0034, "num_tokens": 59477952.0, "rewards/KL_reward/mean": -31.681346893310547, "rewards/KL_reward/std": 4.268650531768799, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7278911564625851, "grad_norm": 0.19380541145801544, "learning_rate": 5e-05, "loss": -0.0016, "num_tokens": 59509904.0, "rewards/KL_reward/mean": -33.75501251220703, "rewards/KL_reward/std": 1.0362346172332764, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7290249433106576, "grad_norm": 0.6822595000267029, "learning_rate": 5e-05, "loss": 0.0021, "num_tokens": 59541992.0, "rewards/KL_reward/mean": -31.853694915771484, "rewards/KL_reward/std": 4.532584190368652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7301587301587301, "grad_norm": 9.342659905087203e-05, "learning_rate": 5e-05, "loss": 0.0027, "num_tokens": 59574264.0, "rewards/KL_reward/mean": -33.14033508300781, "rewards/KL_reward/std": 1.1645630598068237, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7312925170068028, "grad_norm": 0.00038804521318525076, "learning_rate": 5e-05, "loss": 0.0033, "num_tokens": 59606184.0, "rewards/KL_reward/mean": -33.734004974365234, "rewards/KL_reward/std": 1.066764235496521, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7324263038548753, "grad_norm": 0.22279416024684906, "learning_rate": 5e-05, "loss": -0.0084, "num_tokens": 59637624.0, "rewards/KL_reward/mean": -32.14263916015625, "rewards/KL_reward/std": 6.758326053619385, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7335600907029478, "grad_norm": 0.000317032216116786, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 59670168.0, "rewards/KL_reward/mean": -33.10893249511719, "rewards/KL_reward/std": 0.9557627439498901, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7346938775510204, "grad_norm": 0.0006659679929725826, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 59701792.0, "rewards/KL_reward/mean": -34.12440490722656, "rewards/KL_reward/std": 1.477248191833496, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.735827664399093, "grad_norm": 0.0002181259769713506, "learning_rate": 5e-05, "loss": 0.0064, "num_tokens": 59734440.0, "rewards/KL_reward/mean": -33.26518630981445, "rewards/KL_reward/std": 0.6842576265335083, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7369614512471655, "grad_norm": 0.00024293440219480544, "learning_rate": 5e-05, "loss": -0.0095, "num_tokens": 59766600.0, "rewards/KL_reward/mean": -33.562110900878906, "rewards/KL_reward/std": 1.3676056861877441, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7380952380952381, "grad_norm": 0.0003284573322162032, "learning_rate": 5e-05, "loss": -0.0108, "num_tokens": 59798256.0, "rewards/KL_reward/mean": -33.58565902709961, "rewards/KL_reward/std": 1.092898964881897, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7392290249433107, "grad_norm": 0.000712559325620532, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 59829984.0, "rewards/KL_reward/mean": -33.616432189941406, "rewards/KL_reward/std": 1.291390299797058, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7403628117913832, "grad_norm": 0.0013158658985048532, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 59862560.0, "rewards/KL_reward/mean": -33.030364990234375, "rewards/KL_reward/std": 0.949847400188446, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.7414965986394558, "grad_norm": 0.17137093842029572, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 59893937.0, "rewards/KL_reward/mean": -33.508331298828125, "rewards/KL_reward/std": 1.0768557786941528, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7426303854875284, "grad_norm": 0.11959031224250793, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 59925505.0, "rewards/KL_reward/mean": -33.461448669433594, "rewards/KL_reward/std": 1.6048932075500488, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7437641723356009, "grad_norm": 0.0006741550751030445, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 59957553.0, "rewards/KL_reward/mean": -33.40549850463867, "rewards/KL_reward/std": 1.3017975091934204, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7448979591836735, "grad_norm": 0.05803260952234268, "learning_rate": 5e-05, "loss": 0.0039, "num_tokens": 59989185.0, "rewards/KL_reward/mean": -33.339691162109375, "rewards/KL_reward/std": 1.560107707977295, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.746031746031746, "grad_norm": 0.00023864081595093012, "learning_rate": 5e-05, "loss": 0.0048, "num_tokens": 60021105.0, "rewards/KL_reward/mean": -33.468257904052734, "rewards/KL_reward/std": 1.1290925741195679, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7471655328798186, "grad_norm": 0.2241455465555191, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 60052873.0, "rewards/KL_reward/mean": -33.317718505859375, "rewards/KL_reward/std": 1.3931663036346436, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7482993197278912, "grad_norm": 0.0003466380585450679, "learning_rate": 5e-05, "loss": -0.0013, "num_tokens": 60084865.0, "rewards/KL_reward/mean": -33.85884475708008, "rewards/KL_reward/std": 0.9919801950454712, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7494331065759637, "grad_norm": 0.008189519867300987, "learning_rate": 5e-05, "loss": -0.003, "num_tokens": 60116865.0, "rewards/KL_reward/mean": -33.21611785888672, "rewards/KL_reward/std": 1.6068731546401978, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7505668934240363, "grad_norm": 0.0010099131613969803, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 60148969.0, "rewards/KL_reward/mean": -33.85078430175781, "rewards/KL_reward/std": 1.8423516750335693, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7517006802721088, "grad_norm": 0.0002411614841548726, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 60181785.0, "rewards/KL_reward/mean": -34.116703033447266, "rewards/KL_reward/std": 1.4885939359664917, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7528344671201814, "grad_norm": 0.0003497452998999506, "learning_rate": 5e-05, "loss": 0.0062, "num_tokens": 60213737.0, "rewards/KL_reward/mean": -33.90576171875, "rewards/KL_reward/std": 1.1969181299209595, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.753968253968254, "grad_norm": 0.0003262519312556833, "learning_rate": 5e-05, "loss": 0.001, "num_tokens": 60245929.0, "rewards/KL_reward/mean": -33.390018463134766, "rewards/KL_reward/std": 0.8370320200920105, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7551020408163265, "grad_norm": 0.0005763856461271644, "learning_rate": 5e-05, "loss": -0.0021, "num_tokens": 60277121.0, "rewards/KL_reward/mean": -34.03056716918945, "rewards/KL_reward/std": 1.0610737800598145, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7562358276643991, "grad_norm": 0.0036802797112613916, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 60309265.0, "rewards/KL_reward/mean": -33.4349250793457, "rewards/KL_reward/std": 1.7730191946029663, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7573696145124716, "grad_norm": 0.0006001257570460439, "learning_rate": 5e-05, "loss": -0.0031, "num_tokens": 60340953.0, "rewards/KL_reward/mean": -33.36646270751953, "rewards/KL_reward/std": 1.1170799732208252, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.7585034013605442, "grad_norm": 0.25414907932281494, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 60373370.0, "rewards/KL_reward/mean": -33.295066833496094, "rewards/KL_reward/std": 2.0293033123016357, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7596371882086168, "grad_norm": 0.0005584516911767423, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 60405106.0, "rewards/KL_reward/mean": -33.63987731933594, "rewards/KL_reward/std": 0.9843823909759521, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7607709750566893, "grad_norm": 0.0005905695143155754, "learning_rate": 5e-05, "loss": 0.0028, "num_tokens": 60437050.0, "rewards/KL_reward/mean": -33.20232391357422, "rewards/KL_reward/std": 1.0796029567718506, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7619047619047619, "grad_norm": 0.0010460438206791878, "learning_rate": 5e-05, "loss": -0.0022, "num_tokens": 60469426.0, "rewards/KL_reward/mean": -33.624046325683594, "rewards/KL_reward/std": 1.0314666032791138, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7630385487528345, "grad_norm": 0.001069650985300541, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 60501690.0, "rewards/KL_reward/mean": -33.50665283203125, "rewards/KL_reward/std": 1.207753300666809, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.764172335600907, "grad_norm": 0.00037095643347129226, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 60534314.0, "rewards/KL_reward/mean": -32.88195037841797, "rewards/KL_reward/std": 0.6923331022262573, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7653061224489796, "grad_norm": 0.0015014632372185588, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 60566250.0, "rewards/KL_reward/mean": -33.47536849975586, "rewards/KL_reward/std": 1.4117809534072876, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7664399092970522, "grad_norm": 0.0032663193996995687, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 60598346.0, "rewards/KL_reward/mean": -33.787513732910156, "rewards/KL_reward/std": 1.0018196105957031, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7675736961451247, "grad_norm": 0.0004160176613368094, "learning_rate": 5e-05, "loss": 0.0022, "num_tokens": 60630674.0, "rewards/KL_reward/mean": -33.78032684326172, "rewards/KL_reward/std": 0.9000056385993958, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7687074829931972, "grad_norm": 0.0012931914534419775, "learning_rate": 5e-05, "loss": 0.0009, "num_tokens": 60662610.0, "rewards/KL_reward/mean": -34.00667190551758, "rewards/KL_reward/std": 1.7070261240005493, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7698412698412699, "grad_norm": 0.2659655213356018, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 60694410.0, "rewards/KL_reward/mean": -33.678863525390625, "rewards/KL_reward/std": 1.627038836479187, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7709750566893424, "grad_norm": 0.060414332896471024, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 60726946.0, "rewards/KL_reward/mean": -33.41743469238281, "rewards/KL_reward/std": 1.603200912475586, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7721088435374149, "grad_norm": 0.0018846142338588834, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 60758514.0, "rewards/KL_reward/mean": -33.86500549316406, "rewards/KL_reward/std": 1.076326608657837, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7732426303854876, "grad_norm": 0.001117186271585524, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 60790578.0, "rewards/KL_reward/mean": -33.70912170410156, "rewards/KL_reward/std": 2.305903434753418, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7743764172335601, "grad_norm": 0.0017752180574461818, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 60821994.0, "rewards/KL_reward/mean": -33.787025451660156, "rewards/KL_reward/std": 1.4653655290603638, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7755102040816326, "grad_norm": 0.05909721180796623, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 60854154.0, "rewards/KL_reward/mean": -33.027061462402344, "rewards/KL_reward/std": 1.6789178848266602, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7766439909297053, "grad_norm": 0.0009766574949026108, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 60886138.0, "rewards/KL_reward/mean": -33.709434509277344, "rewards/KL_reward/std": 1.4155031442642212, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.7777777777777778, "grad_norm": 0.3626880645751953, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 60918075.0, "rewards/KL_reward/mean": -32.75965881347656, "rewards/KL_reward/std": 2.005737543106079, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7789115646258503, "grad_norm": 0.0031591549050062895, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 60949803.0, "rewards/KL_reward/mean": -33.69989013671875, "rewards/KL_reward/std": 1.6075519323349, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.780045351473923, "grad_norm": 0.24598188698291779, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 60981692.0, "rewards/KL_reward/mean": -32.968814849853516, "rewards/KL_reward/std": 2.120867967605591, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7811791383219955, "grad_norm": 0.003578277537599206, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 61013236.0, "rewards/KL_reward/mean": -33.44221496582031, "rewards/KL_reward/std": 1.487945318222046, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.782312925170068, "grad_norm": 0.02068578079342842, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 61045420.0, "rewards/KL_reward/mean": -33.07246398925781, "rewards/KL_reward/std": 1.2704086303710938, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.7834467120181405, "grad_norm": 0.2770971953868866, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 61077428.0, "rewards/KL_reward/mean": -33.59265899658203, "rewards/KL_reward/std": 1.986149787902832, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.7845804988662132, "grad_norm": 0.3363737463951111, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 61109413.0, "rewards/KL_reward/mean": -34.014347076416016, "rewards/KL_reward/std": 1.1279405355453491, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.7857142857142857, "grad_norm": 0.39339056611061096, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61141135.0, "rewards/KL_reward/mean": -33.55498504638672, "rewards/KL_reward/std": 1.4367928504943848, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.7868480725623582, "grad_norm": 0.2678931653499603, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61173496.0, "rewards/KL_reward/mean": -32.55483627319336, "rewards/KL_reward/std": 1.5880851745605469, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.046875, "epoch": 0.7879818594104309, "grad_norm": 1.5173062086105347, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61205470.0, "rewards/KL_reward/mean": -32.813358306884766, "rewards/KL_reward/std": 3.2993099689483643, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.140625, "epoch": 0.7891156462585034, "grad_norm": 2.8795690536499023, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61237992.0, "rewards/KL_reward/mean": -33.01683044433594, "rewards/KL_reward/std": 2.0140583515167236, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.390625, "epoch": 0.7902494331065759, "grad_norm": 4.886504173278809, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61269946.0, "rewards/KL_reward/mean": -31.504554748535156, "rewards/KL_reward/std": 2.4660799503326416, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.65625, "epoch": 0.7913832199546486, "grad_norm": 4.877257347106934, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61301942.0, "rewards/KL_reward/mean": -30.575576782226562, "rewards/KL_reward/std": 1.9110974073410034, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.8359375, "epoch": 0.7925170068027211, "grad_norm": 2.782092809677124, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61334393.0, "rewards/KL_reward/mean": -30.690383911132812, "rewards/KL_reward/std": 1.3091487884521484, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.96875, "epoch": 0.7936507936507936, "grad_norm": 1.6190053224563599, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61366973.0, "rewards/KL_reward/mean": -29.413843154907227, "rewards/KL_reward/std": 2.7619032859802246, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.000943052233196795, "rewards/thinking_verbosity_reward/std": 0.007626189850270748, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.9921875, "epoch": 0.7947845804988662, "grad_norm": 0.9022070169448853, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61399556.0, "rewards/KL_reward/mean": -29.979713439941406, "rewards/KL_reward/std": 1.8409754037857056, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 2.1640625, "epoch": 0.7959183673469388, "grad_norm": 1.317862868309021, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61431833.0, "rewards/KL_reward/mean": -28.689163208007812, "rewards/KL_reward/std": 3.999366283416748, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.09943688660860062, "rewards/angle_reward/std": 0.29072776436805725, "rewards/thinking_verbosity_reward/mean": -0.002045338973402977, "rewards/thinking_verbosity_reward/std": 0.013879266567528248, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 2.1328125, "epoch": 0.7970521541950113, "grad_norm": 1.5643107891082764, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61464122.0, "rewards/KL_reward/mean": -28.878559112548828, "rewards/KL_reward/std": 2.922321319580078, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.000943052233196795, "rewards/thinking_verbosity_reward/std": 0.007626189850270748, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 2.484375, "epoch": 0.7981859410430839, "grad_norm": 2.655838966369629, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61496728.0, "rewards/KL_reward/mean": -26.264991760253906, "rewards/KL_reward/std": 4.358268737792969, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.18782523274421692, "rewards/angle_reward/std": 0.3916890025138855, "rewards/thinking_verbosity_reward/mean": -0.0027343749534338713, "rewards/thinking_verbosity_reward/std": 0.011413133703172207, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 2.8671875, "epoch": 0.7993197278911565, "grad_norm": 2.6706795692443848, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61529255.0, "rewards/KL_reward/mean": -23.64773178100586, "rewards/KL_reward/std": 4.189241409301758, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 3.6796875, "epoch": 0.800453514739229, "grad_norm": 2.53173565864563, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61561038.0, "rewards/KL_reward/mean": -21.3406982421875, "rewards/KL_reward/std": 5.221982955932617, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.022097086533904076, "rewards/angle_reward/std": 0.17607934772968292, "rewards/thinking_verbosity_reward/mean": -0.0011048543965443969, "rewards/thinking_verbosity_reward/std": 0.00880396831780672, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 5.890625, "epoch": 0.8015873015873016, "grad_norm": 4.612654209136963, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61593640.0, "rewards/KL_reward/mean": -16.051362991333008, "rewards/KL_reward/std": 6.481986045837402, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.18782523274421692, "rewards/angle_reward/std": 0.3916890025138855, "rewards/thinking_verbosity_reward/mean": -0.0035057389177381992, "rewards/thinking_verbosity_reward/std": 0.014961066655814648, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 8.75, "epoch": 0.8027210884353742, "grad_norm": 1.8879761695861816, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61626128.0, "rewards/KL_reward/mean": -12.419229507446289, "rewards/KL_reward/std": 6.6007513999938965, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.09943688660860062, "rewards/angle_reward/std": 0.3406151831150055, "rewards/thinking_verbosity_reward/mean": -0.002896177349612117, "rewards/thinking_verbosity_reward/std": 0.01220763847231865, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 11.8828125, "epoch": 0.8038548752834467, "grad_norm": 1.5015785694122314, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61658801.0, "rewards/KL_reward/mean": -9.83292007446289, "rewards/KL_reward/std": 6.10657262802124, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.22097086906433105, "rewards/angle_reward/std": 0.48399582505226135, "rewards/thinking_verbosity_reward/mean": -0.007983904331922531, "rewards/thinking_verbosity_reward/std": 0.022066637873649597, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 15.640625, "epoch": 0.8049886621315193, "grad_norm": 2.836428165435791, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61692459.0, "rewards/KL_reward/mean": -7.300546646118164, "rewards/KL_reward/std": 4.535953521728516, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.37565046548843384, "rewards/angle_reward/std": 0.5169374942779541, "rewards/thinking_verbosity_reward/mean": -0.010226922109723091, "rewards/thinking_verbosity_reward/std": 0.02758130244910717, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 20.21875, "epoch": 0.8061224489795918, "grad_norm": 2.2767698764801025, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61726919.0, "rewards/KL_reward/mean": -5.976463794708252, "rewards/KL_reward/std": 4.255782127380371, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.408796101808548, "rewards/angle_reward/std": 0.522029459476471, "rewards/thinking_verbosity_reward/mean": -0.010747408494353294, "rewards/thinking_verbosity_reward/std": 0.027381112799048424, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 45.9921875, "epoch": 0.8072562358276644, "grad_norm": 2.9280054569244385, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61764870.0, "rewards/KL_reward/mean": -6.136072158813477, "rewards/KL_reward/std": 4.335809707641602, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3314563035964966, "rewards/angle_reward/std": 0.5746446847915649, "rewards/thinking_verbosity_reward/mean": -0.046691492199897766, "rewards/thinking_verbosity_reward/std": 0.24875198304653168, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 90.9140625, "epoch": 0.808390022675737, "grad_norm": 2.419306516647339, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61808875.0, "rewards/KL_reward/mean": -5.737112045288086, "rewards/KL_reward/std": 5.006260871887207, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.30935922265052795, "rewards/angle_reward/std": 0.6131755709648132, "rewards/thinking_verbosity_reward/mean": -0.10689996182918549, "rewards/thinking_verbosity_reward/std": 0.35072776675224304, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 85.3515625, "epoch": 0.8095238095238095, "grad_norm": 4.568090915679932, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 61852472.0, "rewards/KL_reward/mean": -5.3782958984375, "rewards/KL_reward/std": 4.274779796600342, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3314563035964966, "rewards/angle_reward/std": 0.5465532541275024, "rewards/thinking_verbosity_reward/mean": -0.10199468582868576, "rewards/thinking_verbosity_reward/std": 0.35763248801231384, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 146.1015625, "epoch": 0.8106575963718821, "grad_norm": 2.923835515975952, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61903389.0, "rewards/KL_reward/mean": -5.380932807922363, "rewards/KL_reward/std": 4.547464370727539, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.19887377321720123, "rewards/angle_reward/std": 0.681230366230011, "rewards/thinking_verbosity_reward/mean": -0.2453625500202179, "rewards/thinking_verbosity_reward/std": 0.4244639575481415, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 195.171875, "epoch": 0.8117913832199547, "grad_norm": 0.8407360315322876, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 61960347.0, "rewards/KL_reward/mean": -3.7000811100006104, "rewards/KL_reward/std": 3.552907943725586, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.20992231369018555, "rewards/angle_reward/std": 0.6041808128356934, "rewards/thinking_verbosity_reward/mean": -0.23007535934448242, "rewards/thinking_verbosity_reward/std": 0.4760555028915405, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 352.546875, "epoch": 0.8129251700680272, "grad_norm": 2.3982837200164795, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 62037737.0, "rewards/KL_reward/mean": -3.6907198429107666, "rewards/KL_reward/std": 3.331489324569702, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.15467959642410278, "rewards/angle_reward/std": 0.6695720553398132, "rewards/thinking_verbosity_reward/mean": -0.4037805199623108, "rewards/thinking_verbosity_reward/std": 0.6441807746887207, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 186.6953125, "epoch": 0.8140589569160998, "grad_norm": 1.133590579032898, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 62093306.0, "rewards/KL_reward/mean": -3.6116247177124023, "rewards/KL_reward/std": 2.6408379077911377, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.19887377321720123, "rewards/angle_reward/std": 0.6577072143554688, "rewards/thinking_verbosity_reward/mean": -0.21827402710914612, "rewards/thinking_verbosity_reward/std": 0.47493577003479004, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 302.1328125, "epoch": 0.8151927437641724, "grad_norm": 2.1491713523864746, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 62164507.0, "rewards/KL_reward/mean": -3.0284664630889893, "rewards/KL_reward/std": 2.8550503253936768, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.4087960720062256, "rewards/angle_reward/std": 0.5792295336723328, "rewards/thinking_verbosity_reward/mean": -0.2621035575866699, "rewards/thinking_verbosity_reward/std": 0.6942629218101501, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 233.4296875, "epoch": 0.8163265306122449, "grad_norm": 0.5096162557601929, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 62226674.0, "rewards/KL_reward/mean": -2.5021750926971436, "rewards/KL_reward/std": 1.2480705976486206, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3425048589706421, "rewards/angle_reward/std": 0.5680770874023438, "rewards/thinking_verbosity_reward/mean": -0.17499490082263947, "rewards/thinking_verbosity_reward/std": 0.5807626247406006, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 170.0390625, "epoch": 0.8174603174603174, "grad_norm": 1.7022823095321655, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 62279327.0, "rewards/KL_reward/mean": -2.7811903953552246, "rewards/KL_reward/std": 2.0388731956481934, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.39774757623672485, "rewards/angle_reward/std": 0.5305619835853577, "rewards/thinking_verbosity_reward/mean": -0.13159838318824768, "rewards/thinking_verbosity_reward/std": 0.5158379673957825, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 165.59375, "epoch": 0.81859410430839, "grad_norm": 1.1229782104492188, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 62333123.0, "rewards/KL_reward/mean": -2.5608346462249756, "rewards/KL_reward/std": 1.7398568391799927, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.38669902086257935, "rewards/angle_reward/std": 0.5672101378440857, "rewards/thinking_verbosity_reward/mean": -0.13085666298866272, "rewards/thinking_verbosity_reward/std": 0.4741191267967224, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 394.4765625, "epoch": 0.8197278911564626, "grad_norm": 1.1178206205368042, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 62415904.0, "rewards/KL_reward/mean": -2.433047294616699, "rewards/KL_reward/std": 2.6464271545410156, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3314563035964966, "rewards/angle_reward/std": 0.5746446251869202, "rewards/thinking_verbosity_reward/mean": -0.3151102662086487, "rewards/thinking_verbosity_reward/std": 0.7379494309425354, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 414.7890625, "epoch": 0.8208616780045351, "grad_norm": 3.102323293685913, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 62500749.0, "rewards/KL_reward/mean": -2.4026496410369873, "rewards/KL_reward/std": 2.3765244483947754, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3425048291683197, "rewards/angle_reward/std": 0.5951534509658813, "rewards/thinking_verbosity_reward/mean": -0.33660024404525757, "rewards/thinking_verbosity_reward/std": 0.7675768733024597, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 844.5078125, "epoch": 0.8219954648526077, "grad_norm": 0.27251550555229187, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 62640990.0, "rewards/KL_reward/mean": -1.4234731197357178, "rewards/KL_reward/std": 1.720995545387268, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.1657281517982483, "rewards/angle_reward/std": 0.642855167388916, "rewards/thinking_verbosity_reward/mean": -0.7711927890777588, "rewards/thinking_verbosity_reward/std": 0.8828684091567993, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1464.8359375, "epoch": 0.8231292517006803, "grad_norm": 0.2615896761417389, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 62860689.0, "rewards/KL_reward/mean": -0.6533093452453613, "rewards/KL_reward/std": 1.104088544845581, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.011048540472984314, "rewards/angle_reward/std": 0.7097985744476318, "rewards/thinking_verbosity_reward/mean": -1.3578754663467407, "rewards/thinking_verbosity_reward/std": 0.7664743661880493, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1577.6015625, "epoch": 0.8242630385487528, "grad_norm": 0.5463308095932007, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 63094590.0, "rewards/KL_reward/mean": -0.7473901510238647, "rewards/KL_reward/std": 1.843743920326233, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314563259482384, "rewards/angle_reward/std": 0.7091048955917358, "rewards/thinking_verbosity_reward/mean": -1.3820425271987915, "rewards/thinking_verbosity_reward/std": 0.831028163433075, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1776.9375, "epoch": 0.8253968253968254, "grad_norm": 0.05059307813644409, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 63354686.0, "rewards/KL_reward/mean": -0.37151604890823364, "rewards/KL_reward/std": 0.4577194154262543, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.04419416934251785, "rewards/angle_reward/std": 0.7084973454475403, "rewards/thinking_verbosity_reward/mean": -1.624656081199646, "rewards/thinking_verbosity_reward/std": 0.5813239216804504, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1764.0078125, "epoch": 0.826530612244898, "grad_norm": 0.01426609791815281, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 63612815.0, "rewards/KL_reward/mean": -0.34279125928878784, "rewards/KL_reward/std": 0.4354683458805084, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.022097088396549225, "rewards/angle_reward/std": 0.709538459777832, "rewards/thinking_verbosity_reward/mean": -1.6274614334106445, "rewards/thinking_verbosity_reward/std": 0.5408163666725159, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1806.03125, "epoch": 0.8276643990929705, "grad_norm": 0.011248442344367504, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 63876291.0, "rewards/KL_reward/mean": -0.29520583152770996, "rewards/KL_reward/std": 0.3265356421470642, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.7091048955917358, "rewards/thinking_verbosity_reward/mean": -1.656254768371582, "rewards/thinking_verbosity_reward/std": 0.5175119638442993, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1666.96875, "epoch": 0.828798185941043, "grad_norm": 0.011705402284860611, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 64121343.0, "rewards/KL_reward/mean": -0.3267815411090851, "rewards/KL_reward/std": 0.5070149302482605, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.022097084671258926, "rewards/angle_reward/std": 0.709538459777832, "rewards/thinking_verbosity_reward/mean": -1.5810213088989258, "rewards/thinking_verbosity_reward/std": 0.5288156867027283, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1932.1640625, "epoch": 0.8299319727891157, "grad_norm": 0.029215455055236816, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 64401052.0, "rewards/KL_reward/mean": -0.35986757278442383, "rewards/KL_reward/std": 0.8738176226615906, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.12153397500514984, "rewards/angle_reward/std": 0.6993212699890137, "rewards/thinking_verbosity_reward/mean": -1.7087500095367432, "rewards/thinking_verbosity_reward/std": 0.5540984272956848, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1894.4765625, "epoch": 0.8310657596371882, "grad_norm": 0.014189718291163445, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 64675121.0, "rewards/KL_reward/mean": -0.33786386251449585, "rewards/KL_reward/std": 0.568859875202179, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.13258251547813416, "rewards/angle_reward/std": 0.6972951292991638, "rewards/thinking_verbosity_reward/mean": -1.6881887912750244, "rewards/thinking_verbosity_reward/std": 0.5704836845397949, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1913.125, "epoch": 0.8321995464852607, "grad_norm": 0.3056281805038452, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 64952009.0, "rewards/KL_reward/mean": -0.5387207269668579, "rewards/KL_reward/std": 1.6692149639129639, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.13258251547813416, "rewards/angle_reward/std": 0.6972951292991638, "rewards/thinking_verbosity_reward/mean": -1.685068130493164, "rewards/thinking_verbosity_reward/std": 0.5932014584541321, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1809.8359375, "epoch": 0.8333333333333334, "grad_norm": 0.011273681186139584, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 65215924.0, "rewards/KL_reward/mean": -0.28363704681396484, "rewards/KL_reward/std": 0.3439120054244995, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.7091048955917358, "rewards/thinking_verbosity_reward/mean": -1.6670418977737427, "rewards/thinking_verbosity_reward/std": 0.5123423337936401, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1822.828125, "epoch": 0.8344671201814059, "grad_norm": 0.011458742432296276, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 65480478.0, "rewards/KL_reward/mean": -0.3005499839782715, "rewards/KL_reward/std": 0.38731321692466736, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.7091048955917358, "rewards/thinking_verbosity_reward/mean": -1.6545227766036987, "rewards/thinking_verbosity_reward/std": 0.5496739745140076, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1889.734375, "epoch": 0.8356009070294784, "grad_norm": 0.009957044385373592, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 65754076.0, "rewards/KL_reward/mean": -0.28186577558517456, "rewards/KL_reward/std": 0.3413471579551697, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.07733979821205139, "rewards/angle_reward/std": 0.7056263089179993, "rewards/thinking_verbosity_reward/mean": -1.696712613105774, "rewards/thinking_verbosity_reward/std": 0.5212931632995605, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1938.4375, "epoch": 0.8367346938775511, "grad_norm": 0.00860871933400631, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 66033868.0, "rewards/KL_reward/mean": -0.25816720724105835, "rewards/KL_reward/std": 0.30099010467529297, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.04419417306780815, "rewards/angle_reward/std": 0.7084973454475403, "rewards/thinking_verbosity_reward/mean": -1.7327439785003662, "rewards/thinking_verbosity_reward/std": 0.5168812870979309, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1825.859375, "epoch": 0.8378684807256236, "grad_norm": 0.013760825619101524, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 66298474.0, "rewards/KL_reward/mean": -0.3341813087463379, "rewards/KL_reward/std": 0.7779108285903931, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.07733979821205139, "rewards/angle_reward/std": 0.7056263089179993, "rewards/thinking_verbosity_reward/mean": -1.6584984064102173, "rewards/thinking_verbosity_reward/std": 0.54533451795578, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 2007.4453125, "epoch": 0.8390022675736961, "grad_norm": 0.23512808978557587, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 66587291.0, "rewards/KL_reward/mean": -0.5901498794555664, "rewards/KL_reward/std": 2.1927568912506104, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.19887377321720123, "rewards/angle_reward/std": 0.681230366230011, "rewards/thinking_verbosity_reward/mean": -1.7421135902404785, "rewards/thinking_verbosity_reward/std": 0.5695531964302063, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1833.0703125, "epoch": 0.8401360544217688, "grad_norm": 0.19070129096508026, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 66853860.0, "rewards/KL_reward/mean": -0.41403061151504517, "rewards/KL_reward/std": 1.0047106742858887, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.07733979821205139, "rewards/angle_reward/std": 0.7056263089179993, "rewards/thinking_verbosity_reward/mean": -1.6566379070281982, "rewards/thinking_verbosity_reward/std": 0.5692102909088135, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1909.1328125, "epoch": 0.8412698412698413, "grad_norm": 0.01683916337788105, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 67129485.0, "rewards/KL_reward/mean": -0.37234920263290405, "rewards/KL_reward/std": 1.2858551740646362, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.0883883386850357, "rewards/angle_reward/std": 0.7043173909187317, "rewards/thinking_verbosity_reward/mean": -1.7029697895050049, "rewards/thinking_verbosity_reward/std": 0.5322949290275574, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1864.59375, "epoch": 0.8424036281179138, "grad_norm": 0.2072633057832718, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 67400073.0, "rewards/KL_reward/mean": -0.38009560108184814, "rewards/KL_reward/std": 1.0967984199523926, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.0883883461356163, "rewards/angle_reward/std": 0.7043173909187317, "rewards/thinking_verbosity_reward/mean": -1.6811352968215942, "rewards/thinking_verbosity_reward/std": 0.5584313869476318, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1785.5, "epoch": 0.8435374149659864, "grad_norm": 0.15221014618873596, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 67661073.0, "rewards/KL_reward/mean": -0.8090833425521851, "rewards/KL_reward/std": 2.7380833625793457, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.055242717266082764, "rewards/angle_reward/std": 0.7077155113220215, "rewards/thinking_verbosity_reward/mean": -1.6210336685180664, "rewards/thinking_verbosity_reward/std": 0.6052948832511902, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1860.8046875, "epoch": 0.844671201814059, "grad_norm": 0.2941654324531555, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 67931328.0, "rewards/KL_reward/mean": -0.4073900580406189, "rewards/KL_reward/std": 1.372942566871643, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.0883883461356163, "rewards/angle_reward/std": 0.7043173909187317, "rewards/thinking_verbosity_reward/mean": -1.6936153173446655, "rewards/thinking_verbosity_reward/std": 0.5113928318023682, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1861.3046875, "epoch": 0.8458049886621315, "grad_norm": 0.07613295316696167, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 68201767.0, "rewards/KL_reward/mean": -0.5628873705863953, "rewards/KL_reward/std": 2.050913095474243, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.09943689405918121, "rewards/angle_reward/std": 0.7028310298919678, "rewards/thinking_verbosity_reward/mean": -1.6811420917510986, "rewards/thinking_verbosity_reward/std": 0.618983805179596, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1708.703125, "epoch": 0.8469387755102041, "grad_norm": 0.5091060400009155, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 68452281.0, "rewards/KL_reward/mean": -0.9779430627822876, "rewards/KL_reward/std": 3.240374803543091, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.09943688660860062, "rewards/angle_reward/std": 0.7028310298919678, "rewards/thinking_verbosity_reward/mean": -1.5964150428771973, "rewards/thinking_verbosity_reward/std": 0.6059854030609131, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1769.890625, "epoch": 0.8480725623582767, "grad_norm": 0.3866634666919708, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 68710355.0, "rewards/KL_reward/mean": -0.5445457696914673, "rewards/KL_reward/std": 1.5668498277664185, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.03314562886953354, "rewards/angle_reward/std": 0.7091048955917358, "rewards/thinking_verbosity_reward/mean": -1.6147034168243408, "rewards/thinking_verbosity_reward/std": 0.6311928033828735, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1770.3125, "epoch": 0.8492063492063492, "grad_norm": 0.23417192697525024, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 68969363.0, "rewards/KL_reward/mean": -0.6993677616119385, "rewards/KL_reward/std": 2.2965004444122314, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.06629125773906708, "rewards/angle_reward/std": 0.7067587375640869, "rewards/thinking_verbosity_reward/mean": -1.6513805389404297, "rewards/thinking_verbosity_reward/std": 0.6273873448371887, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1711.46875, "epoch": 0.8503401360544217, "grad_norm": 0.4115210175514221, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 69220487.0, "rewards/KL_reward/mean": -1.3880504369735718, "rewards/KL_reward/std": 3.688795804977417, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.0883883461356163, "rewards/angle_reward/std": 0.7043173909187317, "rewards/thinking_verbosity_reward/mean": -1.5875732898712158, "rewards/thinking_verbosity_reward/std": 0.7431985139846802, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1631.078125, "epoch": 0.8514739229024944, "grad_norm": 0.6005513668060303, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 69460737.0, "rewards/KL_reward/mean": -1.3629591464996338, "rewards/KL_reward/std": 3.653020143508911, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.04419417306780815, "rewards/angle_reward/std": 0.7084973454475403, "rewards/thinking_verbosity_reward/mean": -1.5267325639724731, "rewards/thinking_verbosity_reward/std": 0.7454755902290344, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1621.4609375, "epoch": 0.8526077097505669, "grad_norm": 0.5353714823722839, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 69700668.0, "rewards/KL_reward/mean": -1.4000484943389893, "rewards/KL_reward/std": 3.486201524734497, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/angle_reward/mean": -0.04390813782811165, "rewards/angle_reward/std": 0.7173286080360413, "rewards/thinking_verbosity_reward/mean": -1.5657247304916382, "rewards/thinking_verbosity_reward/std": 0.778367817401886, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1548.1796875, "epoch": 0.8537414965986394, "grad_norm": 1.4211868047714233, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 69930411.0, "rewards/KL_reward/mean": -1.7806217670440674, "rewards/KL_reward/std": 4.288891315460205, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/angle_reward/mean": -0.031445786356925964, "rewards/angle_reward/std": 0.7157651782035828, "rewards/thinking_verbosity_reward/mean": -1.5117034912109375, "rewards/thinking_verbosity_reward/std": 0.8444975018501282, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1287.015625, "epoch": 0.854875283446712, "grad_norm": 0.9926010370254517, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70126653.0, "rewards/KL_reward/mean": -2.841733455657959, "rewards/KL_reward/std": 5.6521501541137695, "rewards/accuracy_reward/mean": 0.0078125, "rewards/accuracy_reward/std": 0.0883883461356163, "rewards/angle_reward/mean": 0.020758455619215965, "rewards/angle_reward/std": 0.7000128626823425, "rewards/thinking_verbosity_reward/mean": -1.3156667947769165, "rewards/thinking_verbosity_reward/std": 0.8615114688873291, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1056.078125, "epoch": 0.8560090702947846, "grad_norm": 1.198169469833374, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 70293687.0, "rewards/KL_reward/mean": -4.479970932006836, "rewards/KL_reward/std": 6.986310958862305, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.05524270981550217, "rewards/angle_reward/std": 0.7077155113220215, "rewards/thinking_verbosity_reward/mean": -1.1392841339111328, "rewards/thinking_verbosity_reward/std": 0.927627444267273, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 541.9453125, "epoch": 0.8571428571428571, "grad_norm": 1.0973591804504395, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70395168.0, "rewards/KL_reward/mean": -9.952540397644043, "rewards/KL_reward/std": 8.075974464416504, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.2651650309562683, "rewards/angle_reward/std": 0.6580811738967896, "rewards/thinking_verbosity_reward/mean": -0.6300583481788635, "rewards/thinking_verbosity_reward/std": 0.8170543909072876, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 462.8046875, "epoch": 0.8582766439909297, "grad_norm": 1.203218936920166, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70486183.0, "rewards/KL_reward/mean": -11.928443908691406, "rewards/KL_reward/std": 7.501312732696533, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3314563035964966, "rewards/angle_reward/std": 0.6270634531974792, "rewards/thinking_verbosity_reward/mean": -0.5203590393066406, "rewards/thinking_verbosity_reward/std": 0.7906116843223572, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 296.5703125, "epoch": 0.8594104308390023, "grad_norm": 2.216782569885254, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70556584.0, "rewards/KL_reward/mean": -12.616353988647461, "rewards/KL_reward/std": 7.149302959442139, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.22097086906433105, "rewards/angle_reward/std": 0.5733586549758911, "rewards/thinking_verbosity_reward/mean": -0.3910168409347534, "rewards/thinking_verbosity_reward/std": 0.6347672939300537, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 292.3203125, "epoch": 0.8605442176870748, "grad_norm": 0.9063274264335632, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 70625433.0, "rewards/KL_reward/mean": -13.474018096923828, "rewards/KL_reward/std": 7.214576244354248, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.2651650309562683, "rewards/angle_reward/std": 0.5818785429000854, "rewards/thinking_verbosity_reward/mean": -0.3680382966995239, "rewards/thinking_verbosity_reward/std": 0.594377338886261, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 165.984375, "epoch": 0.8616780045351474, "grad_norm": 0.2434951364994049, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70678599.0, "rewards/KL_reward/mean": -14.889543533325195, "rewards/KL_reward/std": 5.734178066253662, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3425048291683197, "rewards/angle_reward/std": 0.5096268057823181, "rewards/thinking_verbosity_reward/mean": -0.24580667912960052, "rewards/thinking_verbosity_reward/std": 0.48874032497406006, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 93.9765625, "epoch": 0.86281179138322, "grad_norm": 0.23471392691135406, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70722692.0, "rewards/KL_reward/mean": -15.928576469421387, "rewards/KL_reward/std": 4.604101657867432, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.23201939463615417, "rewards/angle_reward/std": 0.44464775919914246, "rewards/thinking_verbosity_reward/mean": -0.1656557023525238, "rewards/thinking_verbosity_reward/std": 0.35872092843055725, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 74.8515625, "epoch": 0.8639455782312925, "grad_norm": 0.3464663624763489, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70764873.0, "rewards/KL_reward/mean": -15.698003768920898, "rewards/KL_reward/std": 4.995001316070557, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.2541164755821228, "rewards/angle_reward/std": 0.46731242537498474, "rewards/thinking_verbosity_reward/mean": -0.163039430975914, "rewards/thinking_verbosity_reward/std": 0.31206098198890686, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 22.109375, "epoch": 0.8650793650793651, "grad_norm": 0.16134105622768402, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70799663.0, "rewards/KL_reward/mean": -16.368640899658203, "rewards/KL_reward/std": 3.071010112762451, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.15467959642410278, "rewards/angle_reward/std": 0.36519327759742737, "rewards/thinking_verbosity_reward/mean": -0.09887511283159256, "rewards/thinking_verbosity_reward/std": 0.16042843461036682, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 49.7890625, "epoch": 0.8662131519274376, "grad_norm": 0.19804765284061432, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 70837868.0, "rewards/KL_reward/mean": -16.30763053894043, "rewards/KL_reward/std": 4.001601219177246, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.2651650309562683, "rewards/angle_reward/std": 0.42556124925613403, "rewards/thinking_verbosity_reward/mean": -0.12632089853286743, "rewards/thinking_verbosity_reward/std": 0.25668248534202576, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 32.21875, "epoch": 0.8673469387755102, "grad_norm": 0.18399427831172943, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70873632.0, "rewards/KL_reward/mean": -16.83941650390625, "rewards/KL_reward/std": 2.848832130432129, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.13258251547813416, "rewards/angle_reward/std": 0.3290405869483948, "rewards/thinking_verbosity_reward/mean": -0.10092172026634216, "rewards/thinking_verbosity_reward/std": 0.20551058650016785, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 27.859375, "epoch": 0.8684807256235828, "grad_norm": 0.14216119050979614, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70909534.0, "rewards/KL_reward/mean": -17.146116256713867, "rewards/KL_reward/std": 2.3291776180267334, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.09943688660860062, "rewards/angle_reward/std": 0.29072776436805725, "rewards/thinking_verbosity_reward/mean": -0.0937347561120987, "rewards/thinking_verbosity_reward/std": 0.1904277354478836, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 27.328125, "epoch": 0.8696145124716553, "grad_norm": 0.07388874888420105, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70944712.0, "rewards/KL_reward/mean": -16.985986709594727, "rewards/KL_reward/std": 2.9843196868896484, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0883883461356163, "rewards/angle_reward/std": 0.2943028509616852, "rewards/thinking_verbosity_reward/mean": -0.10031235218048096, "rewards/thinking_verbosity_reward/std": 0.18606983125209808, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 11.6171875, "epoch": 0.8707482993197279, "grad_norm": 0.023837469518184662, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 70978007.0, "rewards/KL_reward/mean": -17.148460388183594, "rewards/KL_reward/std": 2.3115603923797607, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.09943688660860062, "rewards/angle_reward/std": 0.29072776436805725, "rewards/thinking_verbosity_reward/mean": -0.08404825627803802, "rewards/thinking_verbosity_reward/std": 0.10351286828517914, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 20.359375, "epoch": 0.8718820861678005, "grad_norm": 0.01366042997688055, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71012933.0, "rewards/KL_reward/mean": -17.068466186523438, "rewards/KL_reward/std": 1.6161787509918213, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0847688764333725, "rewards/thinking_verbosity_reward/std": 0.15905039012432098, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 13.5546875, "epoch": 0.873015873015873, "grad_norm": 0.15407995879650116, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71046300.0, "rewards/KL_reward/mean": -16.896625518798828, "rewards/KL_reward/std": 2.2293224334716797, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.13258251547813416, "rewards/angle_reward/std": 0.3290405869483948, "rewards/thinking_verbosity_reward/mean": -0.08467651158571243, "rewards/thinking_verbosity_reward/std": 0.11787908524274826, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 5.390625, "epoch": 0.8741496598639455, "grad_norm": 7.8302001953125, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71078734.0, "rewards/KL_reward/mean": -17.01534080505371, "rewards/KL_reward/std": 2.0076377391815186, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.3486475646495819, "rewards/thinking_verbosity_reward/mean": -0.07630415260791779, "rewards/thinking_verbosity_reward/std": 0.03890657052397728, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.8752834467120182, "grad_norm": 0.040065620094537735, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71111142.0, "rewards/KL_reward/mean": -16.999956130981445, "rewards/KL_reward/std": 0.7916744947433472, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.0707106813788414, "rewards/thinking_verbosity_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 13.7265625, "epoch": 0.8764172335600907, "grad_norm": 0.942425549030304, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71144795.0, "rewards/KL_reward/mean": -16.707111358642578, "rewards/KL_reward/std": 3.559162139892578, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.022097084671258926, "rewards/angle_reward/std": 0.39621734619140625, "rewards/thinking_verbosity_reward/mean": -0.08929823338985443, "rewards/thinking_verbosity_reward/std": 0.11558838188648224, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.0, "epoch": 0.8775510204081632, "grad_norm": 0.011615153402090073, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71176971.0, "rewards/KL_reward/mean": -17.427825927734375, "rewards/KL_reward/std": 0.44023942947387695, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": -0.0707106813788414, "rewards/thinking_verbosity_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 12.203125, "epoch": 0.8786848072562359, "grad_norm": 0.8019652962684631, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71210461.0, "rewards/KL_reward/mean": -17.166763305664062, "rewards/KL_reward/std": 1.8365185260772705, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.2509823143482208, "rewards/thinking_verbosity_reward/mean": -0.07997027784585953, "rewards/thinking_verbosity_reward/std": 0.11123532056808472, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 15.296875, "epoch": 0.8798185941043084, "grad_norm": 0.6402688026428223, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71244547.0, "rewards/KL_reward/mean": -17.005334854125977, "rewards/KL_reward/std": 2.8297479152679443, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.022097086533904076, "rewards/angle_reward/std": 0.3065877854824066, "rewards/thinking_verbosity_reward/mean": -0.08827764540910721, "rewards/thinking_verbosity_reward/std": 0.12712766230106354, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.1640625, "epoch": 0.8809523809523809, "grad_norm": 0.09513384848833084, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71276896.0, "rewards/KL_reward/mean": -17.158042907714844, "rewards/KL_reward/std": 1.2754546403884888, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.07172074913978577, "rewards/thinking_verbosity_reward/std": 0.011427669785916805, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 5.09375, "epoch": 0.8820861678004536, "grad_norm": 2.8279929161071777, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71308932.0, "rewards/KL_reward/mean": -17.66575050354004, "rewards/KL_reward/std": 3.2178332805633545, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.011048544198274612, "rewards/angle_reward/std": 0.39668282866477966, "rewards/thinking_verbosity_reward/mean": -0.07160855084657669, "rewards/thinking_verbosity_reward/std": 0.04149220883846283, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 4.9296875, "epoch": 0.8832199546485261, "grad_norm": 5.776390075683594, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71341827.0, "rewards/KL_reward/mean": -19.910785675048828, "rewards/KL_reward/std": 6.193860054016113, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": -0.0883883386850357, "rewards/angle_reward/std": 0.5818785429000854, "rewards/thinking_verbosity_reward/mean": -0.0617288276553154, "rewards/thinking_verbosity_reward/std": 0.05266613885760307, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 50.2578125, "epoch": 0.8843537414965986, "grad_norm": 6.9208598136901855, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71380524.0, "rewards/KL_reward/mean": -24.811588287353516, "rewards/KL_reward/std": 8.277115821838379, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.14363105595111847, "rewards/angle_reward/std": 0.6720480918884277, "rewards/thinking_verbosity_reward/mean": -0.07564210891723633, "rewards/thinking_verbosity_reward/std": 0.34243935346603394, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.5859375, "epoch": 0.8854875283446711, "grad_norm": 4.5248003005981445, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71412351.0, "rewards/KL_reward/mean": -30.427478790283203, "rewards/KL_reward/std": 7.362793922424316, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.3425048589706421, "rewards/angle_reward/std": 0.5680770874023438, "rewards/thinking_verbosity_reward/mean": -0.014322892762720585, "rewards/thinking_verbosity_reward/std": 0.030253706499934196, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.21875, "epoch": 0.8866213151927438, "grad_norm": 1.312471866607666, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71444915.0, "rewards/KL_reward/mean": -31.381324768066406, "rewards/KL_reward/std": 4.721668243408203, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.14363107085227966, "rewards/angle_reward/std": 0.3697133958339691, "rewards/thinking_verbosity_reward/mean": -0.004610119387507439, "rewards/thinking_verbosity_reward/std": 0.02076861448585987, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 25.1015625, "epoch": 0.8877551020408163, "grad_norm": 1.2358919382095337, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71479432.0, "rewards/KL_reward/mean": -32.86699676513672, "rewards/KL_reward/std": 4.977630615234375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.19887377321720123, "rewards/angle_reward/std": 0.38615304231643677, "rewards/thinking_verbosity_reward/mean": -0.02466108277440071, "rewards/thinking_verbosity_reward/std": 0.24514424800872803, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.03125, "epoch": 0.8888888888888888, "grad_norm": 0.6902227401733398, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71511756.0, "rewards/KL_reward/mean": -32.74382019042969, "rewards/KL_reward/std": 2.933997631072998, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.0011048543965443969, "rewards/thinking_verbosity_reward/std": 0.00880396831780672, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0859375, "epoch": 0.8900226757369615, "grad_norm": 0.7480749487876892, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71543295.0, "rewards/KL_reward/mean": -32.922210693359375, "rewards/KL_reward/std": 3.685519218444824, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.13258251547813416, "rewards/angle_reward/std": 0.3290405869483948, "rewards/thinking_verbosity_reward/mean": -0.002333864104002714, "rewards/thinking_verbosity_reward/std": 0.013102501630783081, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.078125, "epoch": 0.891156462585034, "grad_norm": 1.5596857070922852, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 71575569.0, "rewards/KL_reward/mean": -32.251678466796875, "rewards/KL_reward/std": 4.577408790588379, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.12153397500514984, "rewards/angle_reward/std": 0.33331283926963806, "rewards/thinking_verbosity_reward/mean": -0.002762136049568653, "rewards/thinking_verbosity_reward/std": 0.013753578998148441, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0703125, "epoch": 0.8922902494331065, "grad_norm": 1.3572076559066772, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71607450.0, "rewards/KL_reward/mean": -32.8837776184082, "rewards/KL_reward/std": 3.814033269882202, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.1657281517982483, "rewards/angle_reward/std": 0.3602752089500427, "rewards/thinking_verbosity_reward/mean": -0.0024385314900428057, "rewards/thinking_verbosity_reward/std": 0.012308008037507534, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.8934240362811792, "grad_norm": 0.04590372368693352, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 71639386.0, "rewards/KL_reward/mean": -33.47450637817383, "rewards/KL_reward/std": 1.6878197193145752, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.8945578231292517, "grad_norm": 1.8047031164169312, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71671795.0, "rewards/KL_reward/mean": -33.27348327636719, "rewards/KL_reward/std": 1.5237458944320679, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.8956916099773242, "grad_norm": 0.055271029472351074, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 71703491.0, "rewards/KL_reward/mean": -33.1529655456543, "rewards/KL_reward/std": 2.925701141357422, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.8968253968253969, "grad_norm": 0.25706303119659424, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 71735829.0, "rewards/KL_reward/mean": -33.68821716308594, "rewards/KL_reward/std": 2.3951783180236816, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.8979591836734694, "grad_norm": 0.011360935866832733, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 71768021.0, "rewards/KL_reward/mean": -33.742801666259766, "rewards/KL_reward/std": 0.9701665043830872, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.8990929705215419, "grad_norm": 0.003016524715349078, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71800237.0, "rewards/KL_reward/mean": -33.488868713378906, "rewards/KL_reward/std": 0.9943328499794006, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0234375, "epoch": 0.9002267573696145, "grad_norm": 1.0178414583206177, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 71832376.0, "rewards/KL_reward/mean": -33.524112701416016, "rewards/KL_reward/std": 1.811699628829956, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 23.578125, "epoch": 0.9013605442176871, "grad_norm": 0.10701996088027954, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 71867634.0, "rewards/KL_reward/mean": -33.605445861816406, "rewards/KL_reward/std": 3.248577356338501, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.021003132686018944, "rewards/thinking_verbosity_reward/std": 0.23762330412864685, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9024943310657596, "grad_norm": 0.019463684409856796, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 71899778.0, "rewards/KL_reward/mean": -34.059669494628906, "rewards/KL_reward/std": 0.766512393951416, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9036281179138322, "grad_norm": 0.15101948380470276, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 71931714.0, "rewards/KL_reward/mean": -33.3520622253418, "rewards/KL_reward/std": 1.259179949760437, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9047619047619048, "grad_norm": 0.005296964664012194, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 71964066.0, "rewards/KL_reward/mean": -33.70747375488281, "rewards/KL_reward/std": 2.183056354522705, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9058956916099773, "grad_norm": 0.025313332676887512, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 71995730.0, "rewards/KL_reward/mean": -34.05757141113281, "rewards/KL_reward/std": 1.0717991590499878, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.9070294784580499, "grad_norm": 1.1392728090286255, "learning_rate": 5e-05, "loss": 0.0007, "num_tokens": 72028132.0, "rewards/KL_reward/mean": -33.22903823852539, "rewards/KL_reward/std": 2.581110954284668, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9081632653061225, "grad_norm": 0.0020190200302749872, "learning_rate": 5e-05, "loss": 0.0014, "num_tokens": 72060028.0, "rewards/KL_reward/mean": -33.52140808105469, "rewards/KL_reward/std": 1.0689212083816528, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.909297052154195, "grad_norm": 0.10467184334993362, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 72092500.0, "rewards/KL_reward/mean": -33.260520935058594, "rewards/KL_reward/std": 1.2949765920639038, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9104308390022676, "grad_norm": 0.0027886980678886175, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 72124692.0, "rewards/KL_reward/mean": -33.974403381347656, "rewards/KL_reward/std": 1.119321346282959, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9115646258503401, "grad_norm": 0.010387484915554523, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 72157340.0, "rewards/KL_reward/mean": -32.96565246582031, "rewards/KL_reward/std": 1.5892037153244019, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9126984126984127, "grad_norm": 0.0026018789503723383, "learning_rate": 5e-05, "loss": -0.0, "num_tokens": 72189516.0, "rewards/KL_reward/mean": -33.66239547729492, "rewards/KL_reward/std": 0.877173900604248, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0234375, "epoch": 0.9138321995464853, "grad_norm": 0.0786629393696785, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 72221863.0, "rewards/KL_reward/mean": -33.49411392211914, "rewards/KL_reward/std": 2.0544230937957764, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9149659863945578, "grad_norm": 0.0015710083534941077, "learning_rate": 5e-05, "loss": -0.0009, "num_tokens": 72254455.0, "rewards/KL_reward/mean": -33.7327880859375, "rewards/KL_reward/std": 0.9656606912612915, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9160997732426304, "grad_norm": 0.009940247051417828, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 72286591.0, "rewards/KL_reward/mean": -33.457847595214844, "rewards/KL_reward/std": 1.3197795152664185, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9172335600907029, "grad_norm": 0.002746545011177659, "learning_rate": 5e-05, "loss": 0.0012, "num_tokens": 72318567.0, "rewards/KL_reward/mean": -33.70878219604492, "rewards/KL_reward/std": 1.5371614694595337, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9183673469387755, "grad_norm": 0.003681495087221265, "learning_rate": 5e-05, "loss": 0.0011, "num_tokens": 72350511.0, "rewards/KL_reward/mean": -33.79541778564453, "rewards/KL_reward/std": 1.3166762590408325, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9195011337868481, "grad_norm": 0.0012977722799405456, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 72382647.0, "rewards/KL_reward/mean": -33.16996765136719, "rewards/KL_reward/std": 1.4044597148895264, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9206349206349206, "grad_norm": 0.0006923092296347022, "learning_rate": 5e-05, "loss": 0.0, "num_tokens": 72414679.0, "rewards/KL_reward/mean": -33.57663345336914, "rewards/KL_reward/std": 0.7723235487937927, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9217687074829932, "grad_norm": 0.002347388304769993, "learning_rate": 5e-05, "loss": 0.0013, "num_tokens": 72446583.0, "rewards/KL_reward/mean": -33.96721267700195, "rewards/KL_reward/std": 1.482001781463623, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9229024943310657, "grad_norm": 0.005274900700896978, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 72478815.0, "rewards/KL_reward/mean": -33.25440216064453, "rewards/KL_reward/std": 1.6389936208724976, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0234375, "epoch": 0.9240362811791383, "grad_norm": 0.7483558058738708, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 72511170.0, "rewards/KL_reward/mean": -33.375099182128906, "rewards/KL_reward/std": 2.3296220302581787, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9251700680272109, "grad_norm": 0.007809279952198267, "learning_rate": 5e-05, "loss": -0.0024, "num_tokens": 72542842.0, "rewards/KL_reward/mean": -33.83318328857422, "rewards/KL_reward/std": 1.2061774730682373, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9263038548752834, "grad_norm": 0.0041608852334320545, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 72574522.0, "rewards/KL_reward/mean": -34.07609558105469, "rewards/KL_reward/std": 0.9378816485404968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0234375, "epoch": 0.927437641723356, "grad_norm": 0.6836727857589722, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 72606389.0, "rewards/KL_reward/mean": -33.83430099487305, "rewards/KL_reward/std": 1.9606149196624756, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9285714285714286, "grad_norm": 0.0031801860313862562, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 72638325.0, "rewards/KL_reward/mean": -32.74711990356445, "rewards/KL_reward/std": 1.9061007499694824, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9297052154195011, "grad_norm": 0.0025071382988244295, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 72670381.0, "rewards/KL_reward/mean": -33.63817596435547, "rewards/KL_reward/std": 1.056267261505127, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9308390022675737, "grad_norm": 0.010071910917758942, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 72702093.0, "rewards/KL_reward/mean": -33.934120178222656, "rewards/KL_reward/std": 1.2471729516983032, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9319727891156463, "grad_norm": 0.0020680369343608618, "learning_rate": 5e-05, "loss": -0.0003, "num_tokens": 72734269.0, "rewards/KL_reward/mean": -33.41252136230469, "rewards/KL_reward/std": 1.0085152387619019, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.9331065759637188, "grad_norm": 0.061341334134340286, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 72766319.0, "rewards/KL_reward/mean": -33.721580505371094, "rewards/KL_reward/std": 1.2540863752365112, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9342403628117913, "grad_norm": 0.002179608680307865, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 72798607.0, "rewards/KL_reward/mean": -33.82682800292969, "rewards/KL_reward/std": 1.0525864362716675, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.935374149659864, "grad_norm": 0.0029823859222233295, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 72830295.0, "rewards/KL_reward/mean": -33.834556579589844, "rewards/KL_reward/std": 1.2774964570999146, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9365079365079365, "grad_norm": 0.0008695355500094593, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 72862535.0, "rewards/KL_reward/mean": -33.31136703491211, "rewards/KL_reward/std": 0.9780426025390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.937641723356009, "grad_norm": 0.03273223340511322, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 72894839.0, "rewards/KL_reward/mean": -33.34189224243164, "rewards/KL_reward/std": 1.4788143634796143, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.9387755102040817, "grad_norm": 0.5189052820205688, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 72926696.0, "rewards/KL_reward/mean": -33.972129821777344, "rewards/KL_reward/std": 2.086747169494629, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9399092970521542, "grad_norm": 0.0020410194993019104, "learning_rate": 5e-05, "loss": -0.0007, "num_tokens": 72958368.0, "rewards/KL_reward/mean": -34.186222076416016, "rewards/KL_reward/std": 1.0480031967163086, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9410430839002267, "grad_norm": 0.061405811458826065, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 72991000.0, "rewards/KL_reward/mean": -33.245933532714844, "rewards/KL_reward/std": 0.804006040096283, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9421768707482994, "grad_norm": 0.0010703267762437463, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 73022864.0, "rewards/KL_reward/mean": -33.59266662597656, "rewards/KL_reward/std": 1.5360360145568848, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9433106575963719, "grad_norm": 0.0038445971440523863, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 73054808.0, "rewards/KL_reward/mean": -33.263980865478516, "rewards/KL_reward/std": 1.106505274772644, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9444444444444444, "grad_norm": 0.007286259904503822, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 73087216.0, "rewards/KL_reward/mean": -33.16855239868164, "rewards/KL_reward/std": 1.4726207256317139, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9455782312925171, "grad_norm": 0.0012580020120367408, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 73119296.0, "rewards/KL_reward/mean": -34.0222053527832, "rewards/KL_reward/std": 1.0648142099380493, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0078125, "epoch": 0.9467120181405896, "grad_norm": 0.592292070388794, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 73151793.0, "rewards/KL_reward/mean": -33.82099151611328, "rewards/KL_reward/std": 2.220374584197998, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9478458049886621, "grad_norm": 0.0006550090620294213, "learning_rate": 5e-05, "loss": -0.0005, "num_tokens": 73182945.0, "rewards/KL_reward/mean": -33.92092514038086, "rewards/KL_reward/std": 0.9080770611763, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9489795918367347, "grad_norm": 0.1275930404663086, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 73214609.0, "rewards/KL_reward/mean": -33.773582458496094, "rewards/KL_reward/std": 1.3788979053497314, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9501133786848073, "grad_norm": 0.2163899689912796, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 73246073.0, "rewards/KL_reward/mean": -33.065582275390625, "rewards/KL_reward/std": 1.198525071144104, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9512471655328798, "grad_norm": 0.4263833463191986, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 73278121.0, "rewards/KL_reward/mean": -33.5583381652832, "rewards/KL_reward/std": 1.0508912801742554, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9523809523809523, "grad_norm": 0.007769063580781221, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 73309825.0, "rewards/KL_reward/mean": -33.93596649169922, "rewards/KL_reward/std": 1.1275330781936646, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.953514739229025, "grad_norm": 0.002262361813336611, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 73341953.0, "rewards/KL_reward/mean": -33.48320007324219, "rewards/KL_reward/std": 0.9198437333106995, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9546485260770975, "grad_norm": 0.0017249841475859284, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 73373857.0, "rewards/KL_reward/mean": -33.71734619140625, "rewards/KL_reward/std": 1.1636419296264648, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.95578231292517, "grad_norm": 0.005231876391917467, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 73405865.0, "rewards/KL_reward/mean": -32.96625518798828, "rewards/KL_reward/std": 1.9772436618804932, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9569160997732427, "grad_norm": 0.22509323060512543, "learning_rate": 5e-05, "loss": -0.0011, "num_tokens": 73437481.0, "rewards/KL_reward/mean": -33.50747299194336, "rewards/KL_reward/std": 1.3297713994979858, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.03125, "epoch": 0.9580498866213152, "grad_norm": 0.43399322032928467, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 73469845.0, "rewards/KL_reward/mean": -33.005149841308594, "rewards/KL_reward/std": 2.149930477142334, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.06629125773906708, "rewards/angle_reward/std": 0.24199791252613068, "rewards/thinking_verbosity_reward/mean": -0.0007812500116415322, "rewards/thinking_verbosity_reward/std": 0.006225345656275749, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9591836734693877, "grad_norm": 0.0004079754580743611, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 73502325.0, "rewards/KL_reward/mean": -34.061588287353516, "rewards/KL_reward/std": 0.9618063569068909, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9603174603174603, "grad_norm": 0.000760400842409581, "learning_rate": 5e-05, "loss": 0.0017, "num_tokens": 73534797.0, "rewards/KL_reward/mean": -33.50699996948242, "rewards/KL_reward/std": 0.8282644152641296, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9614512471655329, "grad_norm": 0.020066983997821808, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 73567221.0, "rewards/KL_reward/mean": -33.0126953125, "rewards/KL_reward/std": 1.1289455890655518, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9625850340136054, "grad_norm": 0.004405636806041002, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 73599277.0, "rewards/KL_reward/mean": -33.39598083496094, "rewards/KL_reward/std": 1.8640003204345703, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.963718820861678, "grad_norm": 0.0047976113855838776, "learning_rate": 5e-05, "loss": 0.0015, "num_tokens": 73630781.0, "rewards/KL_reward/mean": -33.9675178527832, "rewards/KL_reward/std": 1.5000041723251343, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9648526077097506, "grad_norm": 0.005626079626381397, "learning_rate": 5e-05, "loss": 0.0003, "num_tokens": 73662093.0, "rewards/KL_reward/mean": -34.15525436401367, "rewards/KL_reward/std": 1.5996448993682861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9659863945578231, "grad_norm": 0.007012875750660896, "learning_rate": 5e-05, "loss": 0.0009, "num_tokens": 73694517.0, "rewards/KL_reward/mean": -33.43494415283203, "rewards/KL_reward/std": 1.1145339012145996, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9671201814058957, "grad_norm": 0.003037257120013237, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 73726869.0, "rewards/KL_reward/mean": -33.14628982543945, "rewards/KL_reward/std": 1.6235324144363403, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9682539682539683, "grad_norm": 0.002727856859564781, "learning_rate": 5e-05, "loss": -0.0004, "num_tokens": 73759517.0, "rewards/KL_reward/mean": -33.35026550292969, "rewards/KL_reward/std": 1.4061540365219116, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9693877551020408, "grad_norm": 0.14295855164527893, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 73791589.0, "rewards/KL_reward/mean": -33.52638244628906, "rewards/KL_reward/std": 1.6596015691757202, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9705215419501134, "grad_norm": 0.02205885760486126, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 73823589.0, "rewards/KL_reward/mean": -33.47103500366211, "rewards/KL_reward/std": 1.1936787366867065, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.971655328798186, "grad_norm": 0.004215400665998459, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 73855749.0, "rewards/KL_reward/mean": -33.84236145019531, "rewards/KL_reward/std": 1.2295082807540894, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9727891156462585, "grad_norm": 0.002084847306832671, "learning_rate": 5e-05, "loss": -0.0002, "num_tokens": 73887389.0, "rewards/KL_reward/mean": -33.31908416748047, "rewards/KL_reward/std": 0.9242004156112671, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9739229024943311, "grad_norm": 0.010090108960866928, "learning_rate": 5e-05, "loss": -0.0014, "num_tokens": 73919005.0, "rewards/KL_reward/mean": -33.2784423828125, "rewards/KL_reward/std": 2.2050771713256836, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9750566893424036, "grad_norm": 0.014052431099116802, "learning_rate": 5e-05, "loss": 0.0027, "num_tokens": 73951565.0, "rewards/KL_reward/mean": -33.448123931884766, "rewards/KL_reward/std": 2.091165781021118, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9761904761904762, "grad_norm": 0.0007639123359695077, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 73983653.0, "rewards/KL_reward/mean": -34.18672561645508, "rewards/KL_reward/std": 0.844143807888031, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9773242630385488, "grad_norm": 0.0013192676706239581, "learning_rate": 5e-05, "loss": 0.0043, "num_tokens": 74016085.0, "rewards/KL_reward/mean": -33.311458587646484, "rewards/KL_reward/std": 1.3442423343658447, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9784580498866213, "grad_norm": 0.010709281079471111, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 74048101.0, "rewards/KL_reward/mean": -33.879364013671875, "rewards/KL_reward/std": 1.5034685134887695, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.9795918367346939, "grad_norm": 0.4506100118160248, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 74079879.0, "rewards/KL_reward/mean": -34.168434143066406, "rewards/KL_reward/std": 1.8324439525604248, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9807256235827665, "grad_norm": 0.00034951631096191704, "learning_rate": 5e-05, "loss": -0.002, "num_tokens": 74111447.0, "rewards/KL_reward/mean": -33.98359680175781, "rewards/KL_reward/std": 1.1653918027877808, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.981859410430839, "grad_norm": 0.00833884160965681, "learning_rate": 5e-05, "loss": 0.0019, "num_tokens": 74143159.0, "rewards/KL_reward/mean": -33.90995407104492, "rewards/KL_reward/std": 1.0673109292984009, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9829931972789115, "grad_norm": 0.015380697324872017, "learning_rate": 5e-05, "loss": 0.0006, "num_tokens": 74175335.0, "rewards/KL_reward/mean": -33.579200744628906, "rewards/KL_reward/std": 1.330080509185791, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9841269841269841, "grad_norm": 0.0010960256913676858, "learning_rate": 5e-05, "loss": -0.0001, "num_tokens": 74207391.0, "rewards/KL_reward/mean": -33.26451873779297, "rewards/KL_reward/std": 0.9738898873329163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9852607709750567, "grad_norm": 0.0006172276334837079, "learning_rate": 5e-05, "loss": -0.0015, "num_tokens": 74239719.0, "rewards/KL_reward/mean": -33.499027252197266, "rewards/KL_reward/std": 1.2270981073379517, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9863945578231292, "grad_norm": 0.0011030619498342276, "learning_rate": 5e-05, "loss": -0.001, "num_tokens": 74271535.0, "rewards/KL_reward/mean": -33.678924560546875, "rewards/KL_reward/std": 1.0721451044082642, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9875283446712018, "grad_norm": 0.0012125244829803705, "learning_rate": 5e-05, "loss": -0.0008, "num_tokens": 74303423.0, "rewards/KL_reward/mean": -33.96786117553711, "rewards/KL_reward/std": 1.3098315000534058, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9886621315192744, "grad_norm": 0.0038846044335514307, "learning_rate": 5e-05, "loss": -0.0018, "num_tokens": 74336031.0, "rewards/KL_reward/mean": -33.63890838623047, "rewards/KL_reward/std": 1.1418358087539673, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9897959183673469, "grad_norm": 0.0008887458825483918, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 74367703.0, "rewards/KL_reward/mean": -33.608436584472656, "rewards/KL_reward/std": 0.9306058883666992, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.015625, "epoch": 0.9909297052154195, "grad_norm": 1.6916550397872925, "learning_rate": 5e-05, "loss": -0.0017, "num_tokens": 74399681.0, "rewards/KL_reward/mean": -33.1297721862793, "rewards/KL_reward/std": 2.368335247039795, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.03314562886953354, "rewards/angle_reward/std": 0.17432378232479095, "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, "rewards/thinking_verbosity_reward/std": 0.0044194171205163, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9920634920634921, "grad_norm": 0.0032412756700068712, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 74432561.0, "rewards/KL_reward/mean": -33.358238220214844, "rewards/KL_reward/std": 1.5701112747192383, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9931972789115646, "grad_norm": 0.14960019290447235, "learning_rate": 5e-05, "loss": -0.0006, "num_tokens": 74465073.0, "rewards/KL_reward/mean": -33.12785339355469, "rewards/KL_reward/std": 1.2311831712722778, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9943310657596371, "grad_norm": 0.0003175087331328541, "learning_rate": 5e-05, "loss": 0.0004, "num_tokens": 74496385.0, "rewards/KL_reward/mean": -33.74147415161133, "rewards/KL_reward/std": 1.2235573530197144, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9954648526077098, "grad_norm": 0.004698865581303835, "learning_rate": 5e-05, "loss": 0.0001, "num_tokens": 74528537.0, "rewards/KL_reward/mean": -34.1864013671875, "rewards/KL_reward/std": 1.1026948690414429, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9965986394557823, "grad_norm": 0.0013571645831689239, "learning_rate": 5e-05, "loss": 0.0002, "num_tokens": 74560633.0, "rewards/KL_reward/mean": -33.780052185058594, "rewards/KL_reward/std": 1.0348597764968872, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9977324263038548, "grad_norm": 0.0004332130483817309, "learning_rate": 5e-05, "loss": 0.0024, "num_tokens": 74591257.0, "rewards/KL_reward/mean": -34.17106628417969, "rewards/KL_reward/std": 1.0338495969772339, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 0.9988662131519275, "grad_norm": 0.019798073917627335, "learning_rate": 5e-05, "loss": 0.0008, "num_tokens": 74623361.0, "rewards/KL_reward/mean": -33.206809997558594, "rewards/KL_reward/std": 1.5338208675384521, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/mean_length": 1.0, "epoch": 1.0, "grad_norm": 0.009082326665520668, "learning_rate": 5e-05, "loss": 0.0005, "num_tokens": 74655785.0, "rewards/KL_reward/mean": -33.23743438720703, "rewards/KL_reward/std": 1.4101316928863525, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/angle_reward/mean": 0.0, "rewards/angle_reward/std": 0.0, "rewards/thinking_verbosity_reward/mean": 0.0, "rewards/thinking_verbosity_reward/std": 0.0, "step": 882 } ], "logging_steps": 1, "max_steps": 882, "num_input_tokens_seen": 74655785, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }