diff --git "a/checkpoint-882/trainer_state.json" "b/checkpoint-882/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-882/trainer_state.json" @@ -0,0 +1,19438 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 882, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 838.1328125, + "epoch": 0.0011337868480725624, + "grad_norm": 0.012316840700805187, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 138961.0, + "rewards/KL_reward/mean": 0.0, + "rewards/KL_reward/std": 0.0, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.03760823979973793, + "rewards/angle_reward/std": 0.7318449020385742, + "rewards/thinking_verbosity_reward/mean": -1.42296302318573, + "rewards/thinking_verbosity_reward/std": 0.26177090406417847, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 840.8203125, + "epoch": 0.0022675736961451248, + "grad_norm": 0.014643240720033646, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 278170.0, + "rewards/KL_reward/mean": -0.00014835168258287013, + "rewards/KL_reward/std": 0.0017017334466800094, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.07582557946443558, + "rewards/angle_reward/std": 0.7120820879936218, + "rewards/thinking_verbosity_reward/mean": -1.429264783859253, + "rewards/thinking_verbosity_reward/std": 0.23895855247974396, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1029.5078125, + "epoch": 0.003401360544217687, + "grad_norm": 0.013309244997799397, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 442091.0, + "rewards/KL_reward/mean": -0.00011519622057676315, + "rewards/KL_reward/std": 0.001265935366973281, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.04690488427877426, + "rewards/angle_reward/std": 0.708452582359314, + "rewards/thinking_verbosity_reward/mean": -1.560727834701538, + "rewards/thinking_verbosity_reward/std": 0.36943718791007996, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 827.4375, + "epoch": 0.0045351473922902496, + "grad_norm": 0.016940688714385033, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 580019.0, + "rewards/KL_reward/mean": -2.250075340270996e-06, + "rewards/KL_reward/std": 0.0013977407943457365, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.018481537699699402, + "rewards/angle_reward/std": 0.7089322805404663, + "rewards/thinking_verbosity_reward/mean": -1.4036836624145508, + "rewards/thinking_verbosity_reward/std": 0.31064456701278687, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 898.796875, + "epoch": 0.005668934240362812, + "grad_norm": 0.012756886892020702, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 726905.0, + "rewards/KL_reward/mean": -0.00013490879791788757, + "rewards/KL_reward/std": 0.0015800945693627, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.014750942587852478, + "rewards/angle_reward/std": 0.7069280743598938, + "rewards/thinking_verbosity_reward/mean": -1.478535771369934, + "rewards/thinking_verbosity_reward/std": 0.2427017092704773, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 992.28125, + "epoch": 0.006802721088435374, + "grad_norm": 0.014865963719785213, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 885477.0, + "rewards/KL_reward/mean": -0.00015435644309036434, + "rewards/KL_reward/std": 0.0012532330583781004, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.09406879544258118, + "rewards/angle_reward/std": 0.7392042875289917, + "rewards/thinking_verbosity_reward/mean": -1.5456072092056274, + "rewards/thinking_verbosity_reward/std": 0.2998106777667999, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 911.109375, + "epoch": 0.007936507936507936, + "grad_norm": 0.014299380593001842, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 1034707.0, + "rewards/KL_reward/mean": -8.136368705891073e-05, + "rewards/KL_reward/std": 0.0018703237874433398, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22826264798641205, + "rewards/angle_reward/mean": -0.015537131577730179, + "rewards/angle_reward/std": 0.6837882995605469, + "rewards/thinking_verbosity_reward/mean": -1.476283311843872, + "rewards/thinking_verbosity_reward/std": 0.31067222356796265, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 864.1796875, + "epoch": 0.009070294784580499, + "grad_norm": 0.014539482071995735, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 1177298.0, + "rewards/KL_reward/mean": -0.0003884948091581464, + "rewards/KL_reward/std": 0.001967003336176276, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.06809777766466141, + "rewards/angle_reward/std": 0.6925191283226013, + "rewards/thinking_verbosity_reward/mean": -1.4459242820739746, + "rewards/thinking_verbosity_reward/std": 0.26023587584495544, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 850.6171875, + "epoch": 0.01020408163265306, + "grad_norm": 0.015095122158527374, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 1318273.0, + "rewards/KL_reward/mean": -0.00043918390292674303, + "rewards/KL_reward/std": 0.0015299812657758594, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.01057947427034378, + "rewards/angle_reward/std": 0.7031927108764648, + "rewards/thinking_verbosity_reward/mean": -1.4381227493286133, + "rewards/thinking_verbosity_reward/std": 0.23724676668643951, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 838.921875, + "epoch": 0.011337868480725623, + "grad_norm": 0.016048606485128403, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 1457487.0, + "rewards/KL_reward/mean": -0.0006849928759038448, + "rewards/KL_reward/std": 0.0018519391305744648, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.014378756284713745, + "rewards/angle_reward/std": 0.7116018533706665, + "rewards/thinking_verbosity_reward/mean": -1.4114718437194824, + "rewards/thinking_verbosity_reward/std": 0.3214956521987915, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 829.4453125, + "epoch": 0.012471655328798186, + "grad_norm": 0.015514638274908066, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 1596120.0, + "rewards/KL_reward/mean": -0.0008332775323651731, + "rewards/KL_reward/std": 0.0021017943508923054, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.020728295668959618, + "rewards/angle_reward/std": 0.6793127655982971, + "rewards/thinking_verbosity_reward/mean": -1.4123082160949707, + "rewards/thinking_verbosity_reward/std": 0.2772451639175415, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 751.203125, + "epoch": 0.013605442176870748, + "grad_norm": 0.015006035566329956, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 1724122.0, + "rewards/KL_reward/mean": -0.0006982996128499508, + "rewards/KL_reward/std": 0.0018833853537216783, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.042627930641174316, + "rewards/angle_reward/std": 0.7245418429374695, + "rewards/thinking_verbosity_reward/mean": -1.350346565246582, + "rewards/thinking_verbosity_reward/std": 0.22878825664520264, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 844.7578125, + "epoch": 0.01473922902494331, + "grad_norm": 0.014846655540168285, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 1864179.0, + "rewards/KL_reward/mean": -0.0004108635475859046, + "rewards/KL_reward/std": 0.00215906766243279, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.03426017612218857, + "rewards/angle_reward/std": 0.7090725302696228, + "rewards/thinking_verbosity_reward/mean": -1.435034990310669, + "rewards/thinking_verbosity_reward/std": 0.22442129254341125, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 829.796875, + "epoch": 0.015873015873015872, + "grad_norm": 0.019330566748976707, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 2002601.0, + "rewards/KL_reward/mean": -0.0008955916855484247, + "rewards/KL_reward/std": 0.002487297635525465, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": 0.008562322705984116, + "rewards/angle_reward/std": 0.699159562587738, + "rewards/thinking_verbosity_reward/mean": -1.4030687808990479, + "rewards/thinking_verbosity_reward/std": 0.32274603843688965, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 879.6875, + "epoch": 0.017006802721088437, + "grad_norm": 0.01438729465007782, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 2147689.0, + "rewards/KL_reward/mean": -0.000829770986456424, + "rewards/KL_reward/std": 0.002734147710725665, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.05539670214056969, + "rewards/angle_reward/std": 0.7079266905784607, + "rewards/thinking_verbosity_reward/mean": -1.4588990211486816, + "rewards/thinking_verbosity_reward/std": 0.2622072398662567, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 874.1484375, + "epoch": 0.018140589569160998, + "grad_norm": 0.012773919850587845, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2292084.0, + "rewards/KL_reward/mean": -0.0006347743328660727, + "rewards/KL_reward/std": 0.0020726905204355717, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.02527463063597679, + "rewards/angle_reward/std": 0.7010154724121094, + "rewards/thinking_verbosity_reward/mean": -1.4546244144439697, + "rewards/thinking_verbosity_reward/std": 0.2597421705722809, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 857.4375, + "epoch": 0.01927437641723356, + "grad_norm": 0.015292557887732983, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 2433140.0, + "rewards/KL_reward/mean": -0.00018716679187491536, + "rewards/KL_reward/std": 0.0016854844288900495, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.08626461029052734, + "rewards/angle_reward/std": 0.7406377196311951, + "rewards/thinking_verbosity_reward/mean": -1.4333903789520264, + "rewards/thinking_verbosity_reward/std": 0.29524025321006775, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 810.0078125, + "epoch": 0.02040816326530612, + "grad_norm": 0.013641524128615856, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2568493.0, + "rewards/KL_reward/mean": -0.0009980101604014635, + "rewards/KL_reward/std": 0.0025766361504793167, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.04107923433184624, + "rewards/angle_reward/std": 0.7071124315261841, + "rewards/thinking_verbosity_reward/mean": -1.3919694423675537, + "rewards/thinking_verbosity_reward/std": 0.2925904393196106, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 833.6484375, + "epoch": 0.021541950113378686, + "grad_norm": 0.01295141689479351, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2707016.0, + "rewards/KL_reward/mean": -0.0005243468331173062, + "rewards/KL_reward/std": 0.001648963545449078, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "rewards/angle_reward/mean": 0.04255552589893341, + "rewards/angle_reward/std": 0.6983751654624939, + "rewards/thinking_verbosity_reward/mean": -1.4239087104797363, + "rewards/thinking_verbosity_reward/std": 0.2333919256925583, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 698.40625, + "epoch": 0.022675736961451247, + "grad_norm": 0.01779448799788952, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2828884.0, + "rewards/KL_reward/mean": -0.0006695782649330795, + "rewards/KL_reward/std": 0.003075928892940283, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.06875322014093399, + "rewards/angle_reward/std": 0.702839732170105, + "rewards/thinking_verbosity_reward/mean": -1.2984439134597778, + "rewards/thinking_verbosity_reward/std": 0.24081647396087646, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 830.3984375, + "epoch": 0.023809523809523808, + "grad_norm": 0.019104426726698875, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 2967359.0, + "rewards/KL_reward/mean": -0.00040592922596260905, + "rewards/KL_reward/std": 0.0025070447009056807, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.03683070093393326, + "rewards/angle_reward/std": 0.7420152425765991, + "rewards/thinking_verbosity_reward/mean": -1.4153430461883545, + "rewards/thinking_verbosity_reward/std": 0.26599904894828796, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 788.46875, + "epoch": 0.024943310657596373, + "grad_norm": 0.014593786559998989, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3100163.0, + "rewards/KL_reward/mean": -0.0007708030752837658, + "rewards/KL_reward/std": 0.001973372884094715, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.08793884515762329, + "rewards/angle_reward/std": 0.728395402431488, + "rewards/thinking_verbosity_reward/mean": -1.3775405883789062, + "rewards/thinking_verbosity_reward/std": 0.26757028698921204, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 895.7421875, + "epoch": 0.026077097505668934, + "grad_norm": 0.01430218294262886, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3246506.0, + "rewards/KL_reward/mean": -0.0006098577287048101, + "rewards/KL_reward/std": 0.001767677254974842, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22826264798641205, + "rewards/angle_reward/mean": 0.06294762343168259, + "rewards/angle_reward/std": 0.6941813230514526, + "rewards/thinking_verbosity_reward/mean": -1.4643688201904297, + "rewards/thinking_verbosity_reward/std": 0.30500882863998413, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 891.09375, + "epoch": 0.027210884353741496, + "grad_norm": 0.01346637960523367, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 3392310.0, + "rewards/KL_reward/mean": -0.0006486388156190515, + "rewards/KL_reward/std": 0.0017589215422049165, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.020684920251369476, + "rewards/angle_reward/std": 0.7452264428138733, + "rewards/thinking_verbosity_reward/mean": -1.4671560525894165, + "rewards/thinking_verbosity_reward/std": 0.2706656754016876, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 783.2578125, + "epoch": 0.02834467120181406, + "grad_norm": 0.015406543388962746, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 3524327.0, + "rewards/KL_reward/mean": -0.0008891090401448309, + "rewards/KL_reward/std": 0.002750263549387455, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.07052788138389587, + "rewards/angle_reward/std": 0.6934146285057068, + "rewards/thinking_verbosity_reward/mean": -1.3774542808532715, + "rewards/thinking_verbosity_reward/std": 0.24232842028141022, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 907.2578125, + "epoch": 0.02947845804988662, + "grad_norm": 0.0155716547742486, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 3671576.0, + "rewards/KL_reward/mean": -0.0009992108680307865, + "rewards/KL_reward/std": 0.001991155557334423, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.0619647316634655, + "rewards/angle_reward/std": 0.713699221611023, + "rewards/thinking_verbosity_reward/mean": -1.463057518005371, + "rewards/thinking_verbosity_reward/std": 0.355150431394577, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 807.40625, + "epoch": 0.030612244897959183, + "grad_norm": 0.013339817523956299, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 3806604.0, + "rewards/KL_reward/mean": -0.0007779946317896247, + "rewards/KL_reward/std": 0.0016896483721211553, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.05560939013957977, + "rewards/angle_reward/std": 0.6921024322509766, + "rewards/thinking_verbosity_reward/mean": -1.390915870666504, + "rewards/thinking_verbosity_reward/std": 0.28644102811813354, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 797.8984375, + "epoch": 0.031746031746031744, + "grad_norm": 0.017945559695363045, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 3941207.0, + "rewards/KL_reward/mean": -0.0016435969155281782, + "rewards/KL_reward/std": 0.003532285103574395, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.023568615317344666, + "rewards/angle_reward/std": 0.7035307288169861, + "rewards/thinking_verbosity_reward/mean": -1.3779566287994385, + "rewards/thinking_verbosity_reward/std": 0.3068205416202545, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 770.296875, + "epoch": 0.032879818594104306, + "grad_norm": 0.016661623492836952, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4072077.0, + "rewards/KL_reward/mean": -0.0008901930414140224, + "rewards/KL_reward/std": 0.0017296381993219256, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "rewards/angle_reward/mean": 0.03749765455722809, + "rewards/angle_reward/std": 0.7588507533073425, + "rewards/thinking_verbosity_reward/mean": -1.3564424514770508, + "rewards/thinking_verbosity_reward/std": 0.28972840309143066, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 737.8671875, + "epoch": 0.034013605442176874, + "grad_norm": 0.017227329313755035, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4198900.0, + "rewards/KL_reward/mean": -0.0016070909332484007, + "rewards/KL_reward/std": 0.0026566628366708755, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.08105626702308655, + "rewards/angle_reward/std": 0.7346787452697754, + "rewards/thinking_verbosity_reward/mean": -1.3244327306747437, + "rewards/thinking_verbosity_reward/std": 0.29772624373435974, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 743.7578125, + "epoch": 0.035147392290249435, + "grad_norm": 0.015487901866436005, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 4325405.0, + "rewards/KL_reward/mean": -0.001097002881579101, + "rewards/KL_reward/std": 0.0027626247610896826, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.017529528588056564, + "rewards/angle_reward/std": 0.737390398979187, + "rewards/thinking_verbosity_reward/mean": -1.336993932723999, + "rewards/thinking_verbosity_reward/std": 0.2641395628452301, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 817.65625, + "epoch": 0.036281179138321996, + "grad_norm": 0.014715392142534256, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 4462409.0, + "rewards/KL_reward/mean": -0.0015238930936902761, + "rewards/KL_reward/std": 0.0031906100921332836, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.08617420494556427, + "rewards/angle_reward/std": 0.703201174736023, + "rewards/thinking_verbosity_reward/mean": -1.3977710008621216, + "rewards/thinking_verbosity_reward/std": 0.2975722551345825, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 816.140625, + "epoch": 0.03741496598639456, + "grad_norm": 0.014042104594409466, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 4599411.0, + "rewards/KL_reward/mean": -0.001561877434141934, + "rewards/KL_reward/std": 0.0028319573029875755, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.0055811963975429535, + "rewards/angle_reward/std": 0.7309610843658447, + "rewards/thinking_verbosity_reward/mean": -1.3999909162521362, + "rewards/thinking_verbosity_reward/std": 0.2801964282989502, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 885.375, + "epoch": 0.03854875283446712, + "grad_norm": 0.013550758361816406, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 4744619.0, + "rewards/KL_reward/mean": -0.0006970397080294788, + "rewards/KL_reward/std": 0.002382143633440137, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.08778101950883865, + "rewards/angle_reward/std": 0.7114675641059875, + "rewards/thinking_verbosity_reward/mean": -1.4606428146362305, + "rewards/thinking_verbosity_reward/std": 0.27926936745643616, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 827.9921875, + "epoch": 0.03968253968253968, + "grad_norm": 0.013626561500132084, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4882898.0, + "rewards/KL_reward/mean": -0.0010192693443968892, + "rewards/KL_reward/std": 0.0023207853082567453, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.06453114748001099, + "rewards/angle_reward/std": 0.6974670886993408, + "rewards/thinking_verbosity_reward/mean": -1.4110902547836304, + "rewards/thinking_verbosity_reward/std": 0.27717676758766174, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 615.5234375, + "epoch": 0.04081632653061224, + "grad_norm": 0.016563985496759415, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 4992893.0, + "rewards/KL_reward/mean": -0.001065884716808796, + "rewards/KL_reward/std": 0.0026898810174316168, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "rewards/angle_reward/mean": 0.03843190148472786, + "rewards/angle_reward/std": 0.7020615935325623, + "rewards/thinking_verbosity_reward/mean": -1.2185498476028442, + "rewards/thinking_verbosity_reward/std": 0.22774890065193176, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 735.5078125, + "epoch": 0.04195011337868481, + "grad_norm": 0.01565818302333355, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5119166.0, + "rewards/KL_reward/mean": -0.0016780947335064411, + "rewards/KL_reward/std": 0.0030078997369855642, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "rewards/angle_reward/mean": -0.000943564809858799, + "rewards/angle_reward/std": 0.7384704947471619, + "rewards/thinking_verbosity_reward/mean": -1.3351643085479736, + "rewards/thinking_verbosity_reward/std": 0.2324395775794983, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 759.9375, + "epoch": 0.04308390022675737, + "grad_norm": 0.01692848466336727, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5248158.0, + "rewards/KL_reward/mean": -0.0013938448391854763, + "rewards/KL_reward/std": 0.0023707833606749773, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.02276681549847126, + "rewards/angle_reward/std": 0.735538899898529, + "rewards/thinking_verbosity_reward/mean": -1.3573532104492188, + "rewards/thinking_verbosity_reward/std": 0.2352214753627777, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 854.5234375, + "epoch": 0.04421768707482993, + "grad_norm": 0.01324634999036789, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5389617.0, + "rewards/KL_reward/mean": -0.0013230672338977456, + "rewards/KL_reward/std": 0.002388726221397519, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.0635545626282692, + "rewards/angle_reward/std": 0.7063141465187073, + "rewards/thinking_verbosity_reward/mean": -1.4416253566741943, + "rewards/thinking_verbosity_reward/std": 0.2365209311246872, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 804.671875, + "epoch": 0.045351473922902494, + "grad_norm": 0.015133047476410866, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5524679.0, + "rewards/KL_reward/mean": -0.001572504872456193, + "rewards/KL_reward/std": 0.002796533051878214, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.05160997435450554, + "rewards/angle_reward/std": 0.7147625684738159, + "rewards/thinking_verbosity_reward/mean": -1.3938100337982178, + "rewards/thinking_verbosity_reward/std": 0.2584945261478424, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 747.8046875, + "epoch": 0.046485260770975055, + "grad_norm": 0.016320038586854935, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5652870.0, + "rewards/KL_reward/mean": -0.0015140497125685215, + "rewards/KL_reward/std": 0.0029329690150916576, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.02412802167236805, + "rewards/angle_reward/std": 0.7106620073318481, + "rewards/thinking_verbosity_reward/mean": -1.340445876121521, + "rewards/thinking_verbosity_reward/std": 0.2660253047943115, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 841.0859375, + "epoch": 0.047619047619047616, + "grad_norm": 0.012567983008921146, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 5792441.0, + "rewards/KL_reward/mean": -0.001314467517659068, + "rewards/KL_reward/std": 0.0024355482310056686, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.0059778811410069466, + "rewards/angle_reward/std": 0.7117019891738892, + "rewards/thinking_verbosity_reward/mean": -1.4328210353851318, + "rewards/thinking_verbosity_reward/std": 0.2181081622838974, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 853.53125, + "epoch": 0.048752834467120185, + "grad_norm": 0.014290316961705685, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 5934229.0, + "rewards/KL_reward/mean": -0.0013754046522080898, + "rewards/KL_reward/std": 0.002865551272407174, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": -0.04012656584382057, + "rewards/angle_reward/std": 0.7322371602058411, + "rewards/thinking_verbosity_reward/mean": -1.4434272050857544, + "rewards/thinking_verbosity_reward/std": 0.2195524126291275, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 812.3984375, + "epoch": 0.049886621315192746, + "grad_norm": 0.015406670048832893, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6070624.0, + "rewards/KL_reward/mean": -0.0012093674158677459, + "rewards/KL_reward/std": 0.002830892102792859, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": 0.06462673842906952, + "rewards/angle_reward/std": 0.7060084939002991, + "rewards/thinking_verbosity_reward/mean": -1.4061496257781982, + "rewards/thinking_verbosity_reward/std": 0.2272499054670334, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 769.0234375, + "epoch": 0.05102040816326531, + "grad_norm": 0.014261603355407715, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6200923.0, + "rewards/KL_reward/mean": -0.0015996790025383234, + "rewards/KL_reward/std": 0.003018921473994851, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": -0.07584583759307861, + "rewards/angle_reward/std": 0.7126216292381287, + "rewards/thinking_verbosity_reward/mean": -1.3641209602355957, + "rewards/thinking_verbosity_reward/std": 0.2442130744457245, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 805.4453125, + "epoch": 0.05215419501133787, + "grad_norm": 0.015710052102804184, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 6336236.0, + "rewards/KL_reward/mean": -0.0011746239615604281, + "rewards/KL_reward/std": 0.0022468946408480406, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.0019333362579345703, + "rewards/angle_reward/std": 0.7123419046401978, + "rewards/thinking_verbosity_reward/mean": -1.3931634426116943, + "rewards/thinking_verbosity_reward/std": 0.2658627927303314, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 747.8203125, + "epoch": 0.05328798185941043, + "grad_norm": 0.014637403190135956, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6464125.0, + "rewards/KL_reward/mean": -0.0015386963495984674, + "rewards/KL_reward/std": 0.002992629073560238, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": 0.03532971814274788, + "rewards/angle_reward/std": 0.7095845341682434, + "rewards/thinking_verbosity_reward/mean": -1.3381673097610474, + "rewards/thinking_verbosity_reward/std": 0.2773464620113373, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 749.6015625, + "epoch": 0.05442176870748299, + "grad_norm": 0.015380950644612312, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 6591602.0, + "rewards/KL_reward/mean": -0.001205449691042304, + "rewards/KL_reward/std": 0.002618036000058055, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.0005743652582168579, + "rewards/angle_reward/std": 0.7198529243469238, + "rewards/thinking_verbosity_reward/mean": -1.337047815322876, + "rewards/thinking_verbosity_reward/std": 0.2906661331653595, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 831.671875, + "epoch": 0.05555555555555555, + "grad_norm": 0.013588406145572662, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 6730392.0, + "rewards/KL_reward/mean": -0.0013649301836267114, + "rewards/KL_reward/std": 0.0024287677370011806, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.1746762990951538, + "rewards/angle_reward/mean": 0.024919526651501656, + "rewards/angle_reward/std": 0.7198489308357239, + "rewards/thinking_verbosity_reward/mean": -1.419439435005188, + "rewards/thinking_verbosity_reward/std": 0.24967728555202484, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 867.8671875, + "epoch": 0.05668934240362812, + "grad_norm": 0.014480918645858765, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 6873375.0, + "rewards/KL_reward/mean": -0.0010345801711082458, + "rewards/KL_reward/std": 0.0020714737474918365, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": 0.09728822112083435, + "rewards/angle_reward/std": 0.7140886187553406, + "rewards/thinking_verbosity_reward/mean": -1.4425103664398193, + "rewards/thinking_verbosity_reward/std": 0.2950771152973175, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 763.890625, + "epoch": 0.05782312925170068, + "grad_norm": 0.01474759727716446, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 7003217.0, + "rewards/KL_reward/mean": -0.0014291137922555208, + "rewards/KL_reward/std": 0.002928144298493862, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.062387678772211075, + "rewards/angle_reward/std": 0.7102147936820984, + "rewards/thinking_verbosity_reward/mean": -1.3571451902389526, + "rewards/thinking_verbosity_reward/std": 0.25666871666908264, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 848.828125, + "epoch": 0.05895691609977324, + "grad_norm": 0.014926274307072163, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 7144267.0, + "rewards/KL_reward/mean": -0.001141536864452064, + "rewards/KL_reward/std": 0.002282851841300726, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.015168175101280212, + "rewards/angle_reward/std": 0.7204251885414124, + "rewards/thinking_verbosity_reward/mean": -1.4233806133270264, + "rewards/thinking_verbosity_reward/std": 0.30707427859306335, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 696.6875, + "epoch": 0.060090702947845805, + "grad_norm": 0.016644051298499107, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 7265363.0, + "rewards/KL_reward/mean": -0.0015665598912164569, + "rewards/KL_reward/std": 0.0025091448333114386, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.06638424098491669, + "rewards/angle_reward/std": 0.7038470506668091, + "rewards/thinking_verbosity_reward/mean": -1.2934876680374146, + "rewards/thinking_verbosity_reward/std": 0.25808796286582947, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 803.3046875, + "epoch": 0.061224489795918366, + "grad_norm": 0.014262320473790169, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 7400074.0, + "rewards/KL_reward/mean": -0.0014873250620439649, + "rewards/KL_reward/std": 0.003350914688780904, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.0006549600511789322, + "rewards/angle_reward/std": 0.7145742774009705, + "rewards/thinking_verbosity_reward/mean": -1.4003604650497437, + "rewards/thinking_verbosity_reward/std": 0.21219314634799957, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 712.109375, + "epoch": 0.06235827664399093, + "grad_norm": 0.015680724754929543, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 7522512.0, + "rewards/KL_reward/mean": -0.0015632398426532745, + "rewards/KL_reward/std": 0.002867954084649682, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.057161569595336914, + "rewards/angle_reward/std": 0.7097866535186768, + "rewards/thinking_verbosity_reward/mean": -1.3076016902923584, + "rewards/thinking_verbosity_reward/std": 0.26154884696006775, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 796.15625, + "epoch": 0.06349206349206349, + "grad_norm": 0.01348777674138546, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 7656388.0, + "rewards/KL_reward/mean": -0.0012216382892802358, + "rewards/KL_reward/std": 0.002764075295999646, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.01048531848937273, + "rewards/angle_reward/std": 0.7025184631347656, + "rewards/thinking_verbosity_reward/mean": -1.3820040225982666, + "rewards/thinking_verbosity_reward/std": 0.28005632758140564, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 791.1328125, + "epoch": 0.06462585034013606, + "grad_norm": 0.015808461233973503, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 7789629.0, + "rewards/KL_reward/mean": -0.0014838757924735546, + "rewards/KL_reward/std": 0.0027472227811813354, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": 0.027304016053676605, + "rewards/angle_reward/std": 0.6935069561004639, + "rewards/thinking_verbosity_reward/mean": -1.3721787929534912, + "rewards/thinking_verbosity_reward/std": 0.30532729625701904, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 655.4453125, + "epoch": 0.06575963718820861, + "grad_norm": 0.01579357124865055, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 7905598.0, + "rewards/KL_reward/mean": -0.0015653329901397228, + "rewards/KL_reward/std": 0.0028549018315970898, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.050490882247686386, + "rewards/angle_reward/std": 0.7122427225112915, + "rewards/thinking_verbosity_reward/mean": -1.2625588178634644, + "rewards/thinking_verbosity_reward/std": 0.20583978295326233, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 764.5, + "epoch": 0.06689342403628118, + "grad_norm": 0.01618257351219654, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 8035030.0, + "rewards/KL_reward/mean": -0.001291181892156601, + "rewards/KL_reward/std": 0.0025379913859069347, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.026311999186873436, + "rewards/angle_reward/std": 0.7142142057418823, + "rewards/thinking_verbosity_reward/mean": -1.3533778190612793, + "rewards/thinking_verbosity_reward/std": 0.27879348397254944, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 773.453125, + "epoch": 0.06802721088435375, + "grad_norm": 0.013515808619558811, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8166064.0, + "rewards/KL_reward/mean": -0.0010551323648542166, + "rewards/KL_reward/std": 0.002373181516304612, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.11679263412952423, + "rewards/angle_reward/std": 0.7029911279678345, + "rewards/thinking_verbosity_reward/mean": -1.3738677501678467, + "rewards/thinking_verbosity_reward/std": 0.2095806747674942, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 909.96875, + "epoch": 0.0691609977324263, + "grad_norm": 0.013791844248771667, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8314388.0, + "rewards/KL_reward/mean": -0.0012126723304390907, + "rewards/KL_reward/std": 0.002127976855263114, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.008181052282452583, + "rewards/angle_reward/std": 0.7238855957984924, + "rewards/thinking_verbosity_reward/mean": -1.4832502603530884, + "rewards/thinking_verbosity_reward/std": 0.2702941596508026, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 604.140625, + "epoch": 0.07029478458049887, + "grad_norm": 0.020265107974410057, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8423910.0, + "rewards/KL_reward/mean": -0.0018225734820589423, + "rewards/KL_reward/std": 0.0034152804873883724, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.039339445531368256, + "rewards/angle_reward/std": 0.713613748550415, + "rewards/thinking_verbosity_reward/mean": -1.2084236145019531, + "rewards/thinking_verbosity_reward/std": 0.218318372964859, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 781.0625, + "epoch": 0.07142857142857142, + "grad_norm": 0.014227538369596004, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8556422.0, + "rewards/KL_reward/mean": -0.0014920226531103253, + "rewards/KL_reward/std": 0.0025782333686947823, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.04792693629860878, + "rewards/angle_reward/std": 0.7073825001716614, + "rewards/thinking_verbosity_reward/mean": -1.37550687789917, + "rewards/thinking_verbosity_reward/std": 0.24198274314403534, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 828.125, + "epoch": 0.07256235827664399, + "grad_norm": 0.013525891117751598, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8694350.0, + "rewards/KL_reward/mean": -0.0012743088882416487, + "rewards/KL_reward/std": 0.0024931752122938633, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": -0.06039245426654816, + "rewards/angle_reward/std": 0.7030278444290161, + "rewards/thinking_verbosity_reward/mean": -1.4096813201904297, + "rewards/thinking_verbosity_reward/std": 0.2851056754589081, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 859.4609375, + "epoch": 0.07369614512471655, + "grad_norm": 0.013323036022484303, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 8836177.0, + "rewards/KL_reward/mean": -0.0013674057554453611, + "rewards/KL_reward/std": 0.0025408368092030287, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.020052675157785416, + "rewards/angle_reward/std": 0.7065397500991821, + "rewards/thinking_verbosity_reward/mean": -1.4406501054763794, + "rewards/thinking_verbosity_reward/std": 0.2669748067855835, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 820.265625, + "epoch": 0.07482993197278912, + "grad_norm": 0.011960196308791637, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 8972659.0, + "rewards/KL_reward/mean": -0.0016648797318339348, + "rewards/KL_reward/std": 0.0024159452877938747, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.13370326161384583, + "rewards/angle_reward/std": 0.6665264368057251, + "rewards/thinking_verbosity_reward/mean": -1.402920126914978, + "rewards/thinking_verbosity_reward/std": 0.2838478088378906, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 735.5, + "epoch": 0.07596371882086168, + "grad_norm": 0.01711840182542801, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 9099099.0, + "rewards/KL_reward/mean": -0.0008906584698706865, + "rewards/KL_reward/std": 0.0025882436893880367, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.10576502978801727, + "rewards/angle_reward/std": 0.7215561866760254, + "rewards/thinking_verbosity_reward/mean": -1.328075647354126, + "rewards/thinking_verbosity_reward/std": 0.27021440863609314, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 745.984375, + "epoch": 0.07709750566893424, + "grad_norm": 0.015710189938545227, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 9226129.0, + "rewards/KL_reward/mean": -0.0013148458674550056, + "rewards/KL_reward/std": 0.002292931778356433, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "rewards/angle_reward/mean": 0.0033968668431043625, + "rewards/angle_reward/std": 0.6955055594444275, + "rewards/thinking_verbosity_reward/mean": -1.3397393226623535, + "rewards/thinking_verbosity_reward/std": 0.26094380021095276, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 731.171875, + "epoch": 0.0782312925170068, + "grad_norm": 0.013799438253045082, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 9351943.0, + "rewards/KL_reward/mean": -0.0013583763502538204, + "rewards/KL_reward/std": 0.0027255534660071135, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.0322229377925396, + "rewards/angle_reward/std": 0.7012441158294678, + "rewards/thinking_verbosity_reward/mean": -1.3318828344345093, + "rewards/thinking_verbosity_reward/std": 0.2277809977531433, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 816.46875, + "epoch": 0.07936507936507936, + "grad_norm": 0.014650252647697926, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 9488251.0, + "rewards/KL_reward/mean": -0.0020846647676080465, + "rewards/KL_reward/std": 0.002860372420400381, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.04663441330194473, + "rewards/angle_reward/std": 0.7286649942398071, + "rewards/thinking_verbosity_reward/mean": -1.4011547565460205, + "rewards/thinking_verbosity_reward/std": 0.2757023870944977, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 691.546875, + "epoch": 0.08049886621315193, + "grad_norm": 0.017928482964634895, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 9608449.0, + "rewards/KL_reward/mean": -0.001543578808195889, + "rewards/KL_reward/std": 0.0029939457308501005, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.040303785353899, + "rewards/angle_reward/std": 0.7216225862503052, + "rewards/thinking_verbosity_reward/mean": -1.292057752609253, + "rewards/thinking_verbosity_reward/std": 0.23946501314640045, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 774.8046875, + "epoch": 0.08163265306122448, + "grad_norm": 0.016091670840978622, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 9739752.0, + "rewards/KL_reward/mean": -0.0016156400088220835, + "rewards/KL_reward/std": 0.0028179564978927374, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.046681374311447144, + "rewards/angle_reward/std": 0.7233835458755493, + "rewards/thinking_verbosity_reward/mean": -1.364489197731018, + "rewards/thinking_verbosity_reward/std": 0.2705446183681488, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 779.796875, + "epoch": 0.08276643990929705, + "grad_norm": 0.01477085892111063, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 9871790.0, + "rewards/KL_reward/mean": -0.001758662285283208, + "rewards/KL_reward/std": 0.002572139957919717, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.06254503130912781, + "rewards/angle_reward/std": 0.7012284398078918, + "rewards/thinking_verbosity_reward/mean": -1.3785450458526611, + "rewards/thinking_verbosity_reward/std": 0.21673206984996796, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 803.3984375, + "epoch": 0.08390022675736962, + "grad_norm": 0.013954821974039078, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10006401.0, + "rewards/KL_reward/mean": -0.002070910297334194, + "rewards/KL_reward/std": 0.0029955198988318443, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": 0.007975263521075249, + "rewards/angle_reward/std": 0.719423234462738, + "rewards/thinking_verbosity_reward/mean": -1.3989970684051514, + "rewards/thinking_verbosity_reward/std": 0.22138263285160065, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 865.4375, + "epoch": 0.08503401360544217, + "grad_norm": 0.015697991475462914, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10149401.0, + "rewards/KL_reward/mean": -0.0014827789273113012, + "rewards/KL_reward/std": 0.0024286627303808928, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.031810443848371506, + "rewards/angle_reward/std": 0.7595731616020203, + "rewards/thinking_verbosity_reward/mean": -1.4320918321609497, + "rewards/thinking_verbosity_reward/std": 0.33330804109573364, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 815.40625, + "epoch": 0.08616780045351474, + "grad_norm": 0.015063408762216568, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10285349.0, + "rewards/KL_reward/mean": -0.0011845446424558759, + "rewards/KL_reward/std": 0.0025990239810198545, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22826264798641205, + "rewards/angle_reward/mean": 0.014048881828784943, + "rewards/angle_reward/std": 0.6989704370498657, + "rewards/thinking_verbosity_reward/mean": -1.4028055667877197, + "rewards/thinking_verbosity_reward/std": 0.2620105445384979, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 755.0546875, + "epoch": 0.0873015873015873, + "grad_norm": 0.014532950706779957, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 10414204.0, + "rewards/KL_reward/mean": -0.00224322103895247, + "rewards/KL_reward/std": 0.00291983550414443, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.050674110651016235, + "rewards/angle_reward/std": 0.7175518274307251, + "rewards/thinking_verbosity_reward/mean": -1.3487523794174194, + "rewards/thinking_verbosity_reward/std": 0.257921040058136, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 814.984375, + "epoch": 0.08843537414965986, + "grad_norm": 0.016100991517305374, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10550802.0, + "rewards/KL_reward/mean": -0.0012918481370434165, + "rewards/KL_reward/std": 0.002293823752552271, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.059970177710056305, + "rewards/angle_reward/std": 0.695135235786438, + "rewards/thinking_verbosity_reward/mean": -1.410853385925293, + "rewards/thinking_verbosity_reward/std": 0.21162235736846924, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 719.9609375, + "epoch": 0.08956916099773243, + "grad_norm": 0.01754230074584484, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 10674333.0, + "rewards/KL_reward/mean": -0.0015553045086562634, + "rewards/KL_reward/std": 0.003376440843567252, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.0025256406515836716, + "rewards/angle_reward/std": 0.7213703393936157, + "rewards/thinking_verbosity_reward/mean": -1.316839575767517, + "rewards/thinking_verbosity_reward/std": 0.2525773346424103, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 828.6328125, + "epoch": 0.09070294784580499, + "grad_norm": 0.014959538355469704, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 10812118.0, + "rewards/KL_reward/mean": -0.0014992081560194492, + "rewards/KL_reward/std": 0.0026648149359971285, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": 0.02252374216914177, + "rewards/angle_reward/std": 0.7095814347267151, + "rewards/thinking_verbosity_reward/mean": -1.3917865753173828, + "rewards/thinking_verbosity_reward/std": 0.36484360694885254, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 787.171875, + "epoch": 0.09183673469387756, + "grad_norm": 0.013848821632564068, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 10944932.0, + "rewards/KL_reward/mean": -0.0012344918213784695, + "rewards/KL_reward/std": 0.0031588769052177668, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "rewards/angle_reward/mean": -0.036400556564331055, + "rewards/angle_reward/std": 0.7025420069694519, + "rewards/thinking_verbosity_reward/mean": -1.3798415660858154, + "rewards/thinking_verbosity_reward/std": 0.24886053800582886, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 797.65625, + "epoch": 0.09297052154195011, + "grad_norm": 0.016953621059656143, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 11079456.0, + "rewards/KL_reward/mean": -0.0018349254969507456, + "rewards/KL_reward/std": 0.0028502477798610926, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.09390395134687424, + "rewards/angle_reward/std": 0.7117813229560852, + "rewards/thinking_verbosity_reward/mean": -1.3859490156173706, + "rewards/thinking_verbosity_reward/std": 0.2669912278652191, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 712.2890625, + "epoch": 0.09410430839002268, + "grad_norm": 0.01498359628021717, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11202453.0, + "rewards/KL_reward/mean": -0.0020972711499780416, + "rewards/KL_reward/std": 0.002872837008908391, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.01293040532618761, + "rewards/angle_reward/std": 0.7263043522834778, + "rewards/thinking_verbosity_reward/mean": -1.3143742084503174, + "rewards/thinking_verbosity_reward/std": 0.22596809267997742, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 744.9765625, + "epoch": 0.09523809523809523, + "grad_norm": 0.014811373315751553, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 11329906.0, + "rewards/KL_reward/mean": -0.0017401642398908734, + "rewards/KL_reward/std": 0.0029196979012340307, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": 0.05458701029419899, + "rewards/angle_reward/std": 0.7082171440124512, + "rewards/thinking_verbosity_reward/mean": -1.3352049589157104, + "rewards/thinking_verbosity_reward/std": 0.27892017364501953, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 806.9296875, + "epoch": 0.0963718820861678, + "grad_norm": 0.014128442853689194, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11465633.0, + "rewards/KL_reward/mean": -0.0013117834459990263, + "rewards/KL_reward/std": 0.003301942953839898, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.008452286012470722, + "rewards/angle_reward/std": 0.713223934173584, + "rewards/thinking_verbosity_reward/mean": -1.3895466327667236, + "rewards/thinking_verbosity_reward/std": 0.29090580344200134, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 791.3125, + "epoch": 0.09750566893424037, + "grad_norm": 0.016451245173811913, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 11598353.0, + "rewards/KL_reward/mean": -0.002211233600974083, + "rewards/KL_reward/std": 0.003300619777292013, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.019138764590024948, + "rewards/angle_reward/std": 0.6985461711883545, + "rewards/thinking_verbosity_reward/mean": -1.368886947631836, + "rewards/thinking_verbosity_reward/std": 0.32039639353752136, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 812.6875, + "epoch": 0.09863945578231292, + "grad_norm": 0.015922971069812775, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 11734393.0, + "rewards/KL_reward/mean": -0.0019225740106776357, + "rewards/KL_reward/std": 0.003200812265276909, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.15048927068710327, + "rewards/angle_reward/std": 0.701505720615387, + "rewards/thinking_verbosity_reward/mean": -1.4007548093795776, + "rewards/thinking_verbosity_reward/std": 0.26006340980529785, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 756.71875, + "epoch": 0.09977324263038549, + "grad_norm": 0.017682049423456192, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 11863573.0, + "rewards/KL_reward/mean": -0.0021198848262429237, + "rewards/KL_reward/std": 0.0028244415298104286, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.05446275323629379, + "rewards/angle_reward/std": 0.7076202034950256, + "rewards/thinking_verbosity_reward/mean": -1.344355583190918, + "rewards/thinking_verbosity_reward/std": 0.2874905467033386, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 716.015625, + "epoch": 0.10090702947845805, + "grad_norm": 0.015480165369808674, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 11986591.0, + "rewards/KL_reward/mean": -0.002201077062636614, + "rewards/KL_reward/std": 0.0028800820000469685, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.10327331721782684, + "rewards/angle_reward/std": 0.6940180063247681, + "rewards/thinking_verbosity_reward/mean": -1.3169283866882324, + "rewards/thinking_verbosity_reward/std": 0.2315988689661026, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 789.28125, + "epoch": 0.10204081632653061, + "grad_norm": 0.0141932163387537, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 12119795.0, + "rewards/KL_reward/mean": -0.00178247201256454, + "rewards/KL_reward/std": 0.002433969872072339, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.04347587376832962, + "rewards/angle_reward/std": 0.7216427326202393, + "rewards/thinking_verbosity_reward/mean": -1.3841197490692139, + "rewards/thinking_verbosity_reward/std": 0.2351774126291275, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 691.609375, + "epoch": 0.10317460317460317, + "grad_norm": 0.01886655017733574, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12240073.0, + "rewards/KL_reward/mean": -0.002210551407188177, + "rewards/KL_reward/std": 0.0031716807279735804, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.06363484263420105, + "rewards/angle_reward/std": 0.6967713832855225, + "rewards/thinking_verbosity_reward/mean": -1.2888851165771484, + "rewards/thinking_verbosity_reward/std": 0.25650152564048767, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 731.5546875, + "epoch": 0.10430839002267574, + "grad_norm": 0.01625211536884308, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12365576.0, + "rewards/KL_reward/mean": -0.0018134694546461105, + "rewards/KL_reward/std": 0.0029981709085404873, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.0358327217400074, + "rewards/angle_reward/std": 0.6934579610824585, + "rewards/thinking_verbosity_reward/mean": -1.3283206224441528, + "rewards/thinking_verbosity_reward/std": 0.249838188290596, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 791.09375, + "epoch": 0.1054421768707483, + "grad_norm": 0.013530511409044266, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 12498508.0, + "rewards/KL_reward/mean": -0.0023961542174220085, + "rewards/KL_reward/std": 0.0037159009370952845, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.08895541727542877, + "rewards/angle_reward/std": 0.7009344696998596, + "rewards/thinking_verbosity_reward/mean": -1.3683531284332275, + "rewards/thinking_verbosity_reward/std": 0.3219837248325348, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 803.390625, + "epoch": 0.10657596371882086, + "grad_norm": 0.012814110144972801, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12633734.0, + "rewards/KL_reward/mean": -0.0018958018627017736, + "rewards/KL_reward/std": 0.0025667615700513124, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.001182081177830696, + "rewards/angle_reward/std": 0.6966385841369629, + "rewards/thinking_verbosity_reward/mean": -1.3940694332122803, + "rewards/thinking_verbosity_reward/std": 0.2510763704776764, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 775.1796875, + "epoch": 0.10770975056689343, + "grad_norm": 0.01644972898066044, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 12765061.0, + "rewards/KL_reward/mean": -0.0021666029933840036, + "rewards/KL_reward/std": 0.003533849259838462, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": 0.01855868473649025, + "rewards/angle_reward/std": 0.692621648311615, + "rewards/thinking_verbosity_reward/mean": -1.3660519123077393, + "rewards/thinking_verbosity_reward/std": 0.2643062174320221, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 710.265625, + "epoch": 0.10884353741496598, + "grad_norm": 0.016994789242744446, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 12888039.0, + "rewards/KL_reward/mean": -0.0026729642413556576, + "rewards/KL_reward/std": 0.004195652902126312, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.0013782698661088943, + "rewards/angle_reward/std": 0.7274826765060425, + "rewards/thinking_verbosity_reward/mean": -1.2989205121994019, + "rewards/thinking_verbosity_reward/std": 0.2943909466266632, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 813.5703125, + "epoch": 0.10997732426303855, + "grad_norm": 0.012327135540544987, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13023496.0, + "rewards/KL_reward/mean": -0.0015367217129096389, + "rewards/KL_reward/std": 0.002783792093396187, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.03059232421219349, + "rewards/angle_reward/std": 0.6990124583244324, + "rewards/thinking_verbosity_reward/mean": -1.3972327709197998, + "rewards/thinking_verbosity_reward/std": 0.2823668420314789, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 776.2265625, + "epoch": 0.1111111111111111, + "grad_norm": 0.015816690400242805, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 13154485.0, + "rewards/KL_reward/mean": -0.001837437623180449, + "rewards/KL_reward/std": 0.003085511038079858, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.0798722580075264, + "rewards/angle_reward/std": 0.7076959609985352, + "rewards/thinking_verbosity_reward/mean": -1.359372854232788, + "rewards/thinking_verbosity_reward/std": 0.30149850249290466, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 702.625, + "epoch": 0.11224489795918367, + "grad_norm": 0.017176752910017967, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13276269.0, + "rewards/KL_reward/mean": -0.002208163496106863, + "rewards/KL_reward/std": 0.0035135303623974323, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.028847502544522285, + "rewards/angle_reward/std": 0.7246776819229126, + "rewards/thinking_verbosity_reward/mean": -1.2869293689727783, + "rewards/thinking_verbosity_reward/std": 0.3140481412410736, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 740.4609375, + "epoch": 0.11337868480725624, + "grad_norm": 0.016666896641254425, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 13403208.0, + "rewards/KL_reward/mean": -0.0020394723396748304, + "rewards/KL_reward/std": 0.003099593333899975, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": -0.04304052144289017, + "rewards/angle_reward/std": 0.718730092048645, + "rewards/thinking_verbosity_reward/mean": -1.3422832489013672, + "rewards/thinking_verbosity_reward/std": 0.2173442840576172, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 673.9296875, + "epoch": 0.1145124716553288, + "grad_norm": 0.016901105642318726, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 13521735.0, + "rewards/KL_reward/mean": -0.002501721028238535, + "rewards/KL_reward/std": 0.0035446849651634693, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.05400076508522034, + "rewards/angle_reward/std": 0.7020770311355591, + "rewards/thinking_verbosity_reward/mean": -1.2788972854614258, + "rewards/thinking_verbosity_reward/std": 0.21710321307182312, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 720.796875, + "epoch": 0.11564625850340136, + "grad_norm": 0.01597214862704277, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 13646085.0, + "rewards/KL_reward/mean": -0.0022737011313438416, + "rewards/KL_reward/std": 0.0033756305929273367, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.07139294594526291, + "rewards/angle_reward/std": 0.7032915949821472, + "rewards/thinking_verbosity_reward/mean": -1.3200069665908813, + "rewards/thinking_verbosity_reward/std": 0.2397989183664322, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 830.6875, + "epoch": 0.11678004535147392, + "grad_norm": 0.01694324240088463, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 13784285.0, + "rewards/KL_reward/mean": -0.0016397619619965553, + "rewards/KL_reward/std": 0.002778163179755211, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": -0.07758922874927521, + "rewards/angle_reward/std": 0.7181770205497742, + "rewards/thinking_verbosity_reward/mean": -1.4208343029022217, + "rewards/thinking_verbosity_reward/std": 0.23640063405036926, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 741.453125, + "epoch": 0.11791383219954649, + "grad_norm": 0.016481440514326096, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 13910383.0, + "rewards/KL_reward/mean": -0.0017886109417304397, + "rewards/KL_reward/std": 0.0030333735048770905, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": 0.014164380729198456, + "rewards/angle_reward/std": 0.7332705855369568, + "rewards/thinking_verbosity_reward/mean": -1.341301679611206, + "rewards/thinking_verbosity_reward/std": 0.22893868386745453, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 722.7890625, + "epoch": 0.11904761904761904, + "grad_norm": 0.015603181906044483, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 14035084.0, + "rewards/KL_reward/mean": -0.0022284667938947678, + "rewards/KL_reward/std": 0.0033084414899349213, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": 0.010492192581295967, + "rewards/angle_reward/std": 0.7046900987625122, + "rewards/thinking_verbosity_reward/mean": -1.3199641704559326, + "rewards/thinking_verbosity_reward/std": 0.25039249658584595, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 792.3984375, + "epoch": 0.12018140589569161, + "grad_norm": 0.012676805257797241, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14167983.0, + "rewards/KL_reward/mean": -0.002145292004570365, + "rewards/KL_reward/std": 0.0035973675549030304, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.030615121126174927, + "rewards/angle_reward/std": 0.6757726073265076, + "rewards/thinking_verbosity_reward/mean": -1.3784376382827759, + "rewards/thinking_verbosity_reward/std": 0.2810404598712921, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 782.265625, + "epoch": 0.12131519274376418, + "grad_norm": 0.018651586025953293, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 14299953.0, + "rewards/KL_reward/mean": -0.0014412910677492619, + "rewards/KL_reward/std": 0.0026455435436218977, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.17002493143081665, + "rewards/angle_reward/std": 0.7069966793060303, + "rewards/thinking_verbosity_reward/mean": -1.376008152961731, + "rewards/thinking_verbosity_reward/std": 0.24543040990829468, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 748.109375, + "epoch": 0.12244897959183673, + "grad_norm": 0.0168473981320858, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14427551.0, + "rewards/KL_reward/mean": -0.001817174255847931, + "rewards/KL_reward/std": 0.0029042731039226055, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.09525615721940994, + "rewards/angle_reward/std": 0.7056543231010437, + "rewards/thinking_verbosity_reward/mean": -1.3519896268844604, + "rewards/thinking_verbosity_reward/std": 0.20033209025859833, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 894.53125, + "epoch": 0.1235827664399093, + "grad_norm": 0.014377085492014885, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14573499.0, + "rewards/KL_reward/mean": -0.0022030770778656006, + "rewards/KL_reward/std": 0.00294887856580317, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.006665170192718506, + "rewards/angle_reward/std": 0.7199759483337402, + "rewards/thinking_verbosity_reward/mean": -1.4535714387893677, + "rewards/thinking_verbosity_reward/std": 0.3491290807723999, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 662.0234375, + "epoch": 0.12471655328798185, + "grad_norm": 0.01902906969189644, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14690518.0, + "rewards/KL_reward/mean": -0.0027123568579554558, + "rewards/KL_reward/std": 0.0032277333084493876, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.04106944054365158, + "rewards/angle_reward/std": 0.7126646637916565, + "rewards/thinking_verbosity_reward/mean": -1.2683844566345215, + "rewards/thinking_verbosity_reward/std": 0.20977459847927094, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 832.5390625, + "epoch": 0.12585034013605442, + "grad_norm": 0.01605716161429882, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14829395.0, + "rewards/KL_reward/mean": -0.0022059683687984943, + "rewards/KL_reward/std": 0.0037288935855031013, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.023672189563512802, + "rewards/angle_reward/std": 0.7149680256843567, + "rewards/thinking_verbosity_reward/mean": -1.4242630004882812, + "rewards/thinking_verbosity_reward/std": 0.2249453365802765, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 676.484375, + "epoch": 0.12698412698412698, + "grad_norm": 0.017349785193800926, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 14947665.0, + "rewards/KL_reward/mean": -0.003657208289951086, + "rewards/KL_reward/std": 0.003820694051682949, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.020158935338258743, + "rewards/angle_reward/std": 0.7064082622528076, + "rewards/thinking_verbosity_reward/mean": -1.2656853199005127, + "rewards/thinking_verbosity_reward/std": 0.29552730917930603, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 718.1640625, + "epoch": 0.12811791383219956, + "grad_norm": 0.016457414254546165, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15070934.0, + "rewards/KL_reward/mean": -0.002713849302381277, + "rewards/KL_reward/std": 0.0035665060859173536, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.03854944556951523, + "rewards/angle_reward/std": 0.7127638459205627, + "rewards/thinking_verbosity_reward/mean": -1.309330940246582, + "rewards/thinking_verbosity_reward/std": 0.28132155537605286, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 708.5, + "epoch": 0.1292517006802721, + "grad_norm": 0.014944375492632389, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15193350.0, + "rewards/KL_reward/mean": -0.0027939858846366405, + "rewards/KL_reward/std": 0.003191334195435047, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.061815641820430756, + "rewards/angle_reward/std": 0.7114821672439575, + "rewards/thinking_verbosity_reward/mean": -1.3131558895111084, + "rewards/thinking_verbosity_reward/std": 0.21124057471752167, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 685.53125, + "epoch": 0.13038548752834467, + "grad_norm": 0.017111442983150482, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15312882.0, + "rewards/KL_reward/mean": -0.0031511278357356787, + "rewards/KL_reward/std": 0.0033508387859910727, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.02089923806488514, + "rewards/angle_reward/std": 0.6855220198631287, + "rewards/thinking_verbosity_reward/mean": -1.2856812477111816, + "rewards/thinking_verbosity_reward/std": 0.2425108700990677, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 742.6484375, + "epoch": 0.13151927437641722, + "grad_norm": 0.017757011577486992, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15439445.0, + "rewards/KL_reward/mean": -0.002031938638538122, + "rewards/KL_reward/std": 0.003581864293664694, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.057499468326568604, + "rewards/angle_reward/std": 0.7282629609107971, + "rewards/thinking_verbosity_reward/mean": -1.3443570137023926, + "rewards/thinking_verbosity_reward/std": 0.21719655394554138, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 662.171875, + "epoch": 0.1326530612244898, + "grad_norm": 0.01628716289997101, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15556275.0, + "rewards/KL_reward/mean": -0.0032515444327145815, + "rewards/KL_reward/std": 0.00442019198089838, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": -0.044349588453769684, + "rewards/angle_reward/std": 0.7159713506698608, + "rewards/thinking_verbosity_reward/mean": -1.2610831260681152, + "rewards/thinking_verbosity_reward/std": 0.2511417865753174, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 731.6953125, + "epoch": 0.13378684807256236, + "grad_norm": 0.014140671119093895, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 15682652.0, + "rewards/KL_reward/mean": -0.00220735976472497, + "rewards/KL_reward/std": 0.0038968021981418133, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": 0.014626394957304, + "rewards/angle_reward/std": 0.7016589641571045, + "rewards/thinking_verbosity_reward/mean": -1.3368642330169678, + "rewards/thinking_verbosity_reward/std": 0.19941078126430511, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 744.453125, + "epoch": 0.1349206349206349, + "grad_norm": 0.017708374187350273, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 15810318.0, + "rewards/KL_reward/mean": -0.0032416447065770626, + "rewards/KL_reward/std": 0.004031994380056858, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": 0.019819986075162888, + "rewards/angle_reward/std": 0.7375350594520569, + "rewards/thinking_verbosity_reward/mean": -1.3275351524353027, + "rewards/thinking_verbosity_reward/std": 0.31135663390159607, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 739.1875, + "epoch": 0.1360544217687075, + "grad_norm": 0.015575222671031952, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 15936750.0, + "rewards/KL_reward/mean": -0.0036213041748851538, + "rewards/KL_reward/std": 0.00411019753664732, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.02828877791762352, + "rewards/angle_reward/std": 0.723593533039093, + "rewards/thinking_verbosity_reward/mean": -1.3370579481124878, + "rewards/thinking_verbosity_reward/std": 0.24100078642368317, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 748.1484375, + "epoch": 0.13718820861678005, + "grad_norm": 0.0148308789357543, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16064481.0, + "rewards/KL_reward/mean": -0.0030179324094206095, + "rewards/KL_reward/std": 0.004045294597744942, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.026180170476436615, + "rewards/angle_reward/std": 0.6928780674934387, + "rewards/thinking_verbosity_reward/mean": -1.3432364463806152, + "rewards/thinking_verbosity_reward/std": 0.25303834676742554, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 706.546875, + "epoch": 0.1383219954648526, + "grad_norm": 0.0160621777176857, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16187199.0, + "rewards/KL_reward/mean": -0.0030728490091860294, + "rewards/KL_reward/std": 0.0036757541820406914, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.03859724476933479, + "rewards/angle_reward/std": 0.7228646874427795, + "rewards/thinking_verbosity_reward/mean": -1.3010919094085693, + "rewards/thinking_verbosity_reward/std": 0.2672620415687561, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 731.1796875, + "epoch": 0.13945578231292516, + "grad_norm": 0.015178170055150986, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 16312910.0, + "rewards/KL_reward/mean": -0.0029267228674143553, + "rewards/KL_reward/std": 0.003854723647236824, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": 0.009864788502454758, + "rewards/angle_reward/std": 0.7046127319335938, + "rewards/thinking_verbosity_reward/mean": -1.335146188735962, + "rewards/thinking_verbosity_reward/std": 0.20772947371006012, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 719.2734375, + "epoch": 0.14058956916099774, + "grad_norm": 0.017491133883595467, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16436281.0, + "rewards/KL_reward/mean": -0.0029566772282123566, + "rewards/KL_reward/std": 0.003907571081072092, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.021206554025411606, + "rewards/angle_reward/std": 0.7237843871116638, + "rewards/thinking_verbosity_reward/mean": -1.3069604635238647, + "rewards/thinking_verbosity_reward/std": 0.29699769616127014, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 771.0078125, + "epoch": 0.1417233560090703, + "grad_norm": 0.014866192825138569, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16567234.0, + "rewards/KL_reward/mean": -0.0039398204535245895, + "rewards/KL_reward/std": 0.004764980636537075, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.06120563670992851, + "rewards/angle_reward/std": 0.6730445027351379, + "rewards/thinking_verbosity_reward/mean": -1.3563966751098633, + "rewards/thinking_verbosity_reward/std": 0.2928489148616791, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 579.8359375, + "epoch": 0.14285714285714285, + "grad_norm": 0.019122039899230003, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16673613.0, + "rewards/KL_reward/mean": -0.0043771034106612206, + "rewards/KL_reward/std": 0.004734088201075792, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.03073747828602791, + "rewards/angle_reward/std": 0.7280511856079102, + "rewards/thinking_verbosity_reward/mean": -1.182072639465332, + "rewards/thinking_verbosity_reward/std": 0.2240225225687027, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 734.953125, + "epoch": 0.14399092970521543, + "grad_norm": 0.017316479235887527, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16799879.0, + "rewards/KL_reward/mean": -0.0034142176155000925, + "rewards/KL_reward/std": 0.004490741994231939, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.04686341434717178, + "rewards/angle_reward/std": 0.7065085172653198, + "rewards/thinking_verbosity_reward/mean": -1.333661437034607, + "rewards/thinking_verbosity_reward/std": 0.2379365712404251, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 695.78125, + "epoch": 0.14512471655328799, + "grad_norm": 0.016514454036951065, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 16921403.0, + "rewards/KL_reward/mean": -0.0031645637936890125, + "rewards/KL_reward/std": 0.0041256980039179325, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.014330117031931877, + "rewards/angle_reward/std": 0.7244763374328613, + "rewards/thinking_verbosity_reward/mean": -1.2905654907226562, + "rewards/thinking_verbosity_reward/std": 0.26780593395233154, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 656.6953125, + "epoch": 0.14625850340136054, + "grad_norm": 0.01521074865013361, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 17037444.0, + "rewards/KL_reward/mean": -0.003692830679938197, + "rewards/KL_reward/std": 0.004924299195408821, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.009508412331342697, + "rewards/angle_reward/std": 0.6744556427001953, + "rewards/thinking_verbosity_reward/mean": -1.2567415237426758, + "rewards/thinking_verbosity_reward/std": 0.2456611543893814, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 690.2578125, + "epoch": 0.1473922902494331, + "grad_norm": 0.01670273207128048, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 17157309.0, + "rewards/KL_reward/mean": -0.0034066352527588606, + "rewards/KL_reward/std": 0.0034871476236730814, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.009749175980687141, + "rewards/angle_reward/std": 0.736328661441803, + "rewards/thinking_verbosity_reward/mean": -1.2978801727294922, + "rewards/thinking_verbosity_reward/std": 0.19752177596092224, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 701.7890625, + "epoch": 0.14852607709750568, + "grad_norm": 0.01933622732758522, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 17279162.0, + "rewards/KL_reward/mean": -0.00316535378806293, + "rewards/KL_reward/std": 0.004378271289169788, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.03546326234936714, + "rewards/angle_reward/std": 0.7148266434669495, + "rewards/thinking_verbosity_reward/mean": -1.2958853244781494, + "rewards/thinking_verbosity_reward/std": 0.2705304026603699, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 757.8046875, + "epoch": 0.14965986394557823, + "grad_norm": 0.0137477433308959, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 17408545.0, + "rewards/KL_reward/mean": -0.003397803520783782, + "rewards/KL_reward/std": 0.003958834335207939, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.01723206788301468, + "rewards/angle_reward/std": 0.7112488746643066, + "rewards/thinking_verbosity_reward/mean": -1.3415770530700684, + "rewards/thinking_verbosity_reward/std": 0.3047771751880646, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 637.734375, + "epoch": 0.15079365079365079, + "grad_norm": 0.019227879121899605, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 17521855.0, + "rewards/KL_reward/mean": -0.003851154586300254, + "rewards/KL_reward/std": 0.005098348017781973, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.04486759379506111, + "rewards/angle_reward/std": 0.7175752520561218, + "rewards/thinking_verbosity_reward/mean": -1.2439886331558228, + "rewards/thinking_verbosity_reward/std": 0.21095074713230133, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 698.1953125, + "epoch": 0.15192743764172337, + "grad_norm": 0.017246374860405922, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 17642864.0, + "rewards/KL_reward/mean": -0.0029999513644725084, + "rewards/KL_reward/std": 0.003456530626863241, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.05042729899287224, + "rewards/angle_reward/std": 0.7183257937431335, + "rewards/thinking_verbosity_reward/mean": -1.285423994064331, + "rewards/thinking_verbosity_reward/std": 0.3023035526275635, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 624.7890625, + "epoch": 0.15306122448979592, + "grad_norm": 0.015937460586428642, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 17754965.0, + "rewards/KL_reward/mean": -0.0046004485338926315, + "rewards/KL_reward/std": 0.003918324131518602, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.03442571312189102, + "rewards/angle_reward/std": 0.7067039608955383, + "rewards/thinking_verbosity_reward/mean": -1.225354790687561, + "rewards/thinking_verbosity_reward/std": 0.24156977236270905, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 755.875, + "epoch": 0.15419501133786848, + "grad_norm": 0.016583018004894257, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 17884061.0, + "rewards/KL_reward/mean": -0.0034096338786184788, + "rewards/KL_reward/std": 0.004352725576609373, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "rewards/angle_reward/mean": 0.048420269042253494, + "rewards/angle_reward/std": 0.7189984917640686, + "rewards/thinking_verbosity_reward/mean": -1.3360917568206787, + "rewards/thinking_verbosity_reward/std": 0.3205486834049225, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 751.96875, + "epoch": 0.15532879818594103, + "grad_norm": 0.015360284596681595, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 18012057.0, + "rewards/KL_reward/mean": -0.003828501794487238, + "rewards/KL_reward/std": 0.004502009600400925, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.020650874823331833, + "rewards/angle_reward/std": 0.7309556007385254, + "rewards/thinking_verbosity_reward/mean": -1.337904453277588, + "rewards/thinking_verbosity_reward/std": 0.2968203127384186, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 721.5234375, + "epoch": 0.1564625850340136, + "grad_norm": 0.01785186119377613, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 18136252.0, + "rewards/KL_reward/mean": -0.0030521510634571314, + "rewards/KL_reward/std": 0.004306137096136808, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.02232043817639351, + "rewards/angle_reward/std": 0.7087429761886597, + "rewards/thinking_verbosity_reward/mean": -1.3114829063415527, + "rewards/thinking_verbosity_reward/std": 0.28632381558418274, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 666.9296875, + "epoch": 0.15759637188208617, + "grad_norm": 0.018005847930908203, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18253931.0, + "rewards/KL_reward/mean": -0.003561299294233322, + "rewards/KL_reward/std": 0.004450182896107435, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.06507096439599991, + "rewards/angle_reward/std": 0.6975307464599609, + "rewards/thinking_verbosity_reward/mean": -1.2706272602081299, + "rewards/thinking_verbosity_reward/std": 0.22487682104110718, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 700.703125, + "epoch": 0.15873015873015872, + "grad_norm": 0.017706632614135742, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18375693.0, + "rewards/KL_reward/mean": -0.0038045665714889765, + "rewards/KL_reward/std": 0.005178486928343773, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.004316430538892746, + "rewards/angle_reward/std": 0.7525703310966492, + "rewards/thinking_verbosity_reward/mean": -1.2824617624282837, + "rewards/thinking_verbosity_reward/std": 0.32464227080345154, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 741.3125, + "epoch": 0.1598639455782313, + "grad_norm": 0.015213903039693832, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 18502533.0, + "rewards/KL_reward/mean": -0.0034087002277374268, + "rewards/KL_reward/std": 0.003798006335273385, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": -0.1237906813621521, + "rewards/angle_reward/std": 0.7100720405578613, + "rewards/thinking_verbosity_reward/mean": -1.3441178798675537, + "rewards/thinking_verbosity_reward/std": 0.2108469009399414, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 660.578125, + "epoch": 0.16099773242630386, + "grad_norm": 0.017703410238027573, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 18619055.0, + "rewards/KL_reward/mean": -0.004657331854104996, + "rewards/KL_reward/std": 0.005449049174785614, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.0407666377723217, + "rewards/angle_reward/std": 0.6839425563812256, + "rewards/thinking_verbosity_reward/mean": -1.242754578590393, + "rewards/thinking_verbosity_reward/std": 0.3245146870613098, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 637.2421875, + "epoch": 0.1621315192743764, + "grad_norm": 0.015972137451171875, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 18732246.0, + "rewards/KL_reward/mean": -0.004260205198079348, + "rewards/KL_reward/std": 0.004415446892380714, + "rewards/accuracy_reward/mean": 0.359375, + "rewards/accuracy_reward/std": 0.481702595949173, + "rewards/angle_reward/mean": 0.1555885374546051, + "rewards/angle_reward/std": 0.7082699537277222, + "rewards/thinking_verbosity_reward/mean": -1.2461501359939575, + "rewards/thinking_verbosity_reward/std": 0.19471445679664612, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 783.53125, + "epoch": 0.16326530612244897, + "grad_norm": 0.015578354708850384, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 18864938.0, + "rewards/KL_reward/mean": -0.004091382492333651, + "rewards/KL_reward/std": 0.004373244475573301, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.02433072030544281, + "rewards/angle_reward/std": 0.7120692133903503, + "rewards/thinking_verbosity_reward/mean": -1.3740453720092773, + "rewards/thinking_verbosity_reward/std": 0.26230984926223755, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 668.21875, + "epoch": 0.16439909297052155, + "grad_norm": 0.017579572275280952, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 18982822.0, + "rewards/KL_reward/mean": -0.003975247032940388, + "rewards/KL_reward/std": 0.004776740446686745, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.09519442915916443, + "rewards/angle_reward/std": 0.7148767113685608, + "rewards/thinking_verbosity_reward/mean": -1.2664110660552979, + "rewards/thinking_verbosity_reward/std": 0.25450992584228516, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 661.7890625, + "epoch": 0.1655328798185941, + "grad_norm": 0.01608235016465187, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 19100075.0, + "rewards/KL_reward/mean": -0.005266926251351833, + "rewards/KL_reward/std": 0.006609444040805101, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.1746762990951538, + "rewards/angle_reward/mean": -0.028426187112927437, + "rewards/angle_reward/std": 0.697603702545166, + "rewards/thinking_verbosity_reward/mean": -1.2481904029846191, + "rewards/thinking_verbosity_reward/std": 0.30778783559799194, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 742.0625, + "epoch": 0.16666666666666666, + "grad_norm": 0.01791374199092388, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 19227435.0, + "rewards/KL_reward/mean": -0.003503902815282345, + "rewards/KL_reward/std": 0.003946464508771896, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.04175577685236931, + "rewards/angle_reward/std": 0.7005606293678284, + "rewards/thinking_verbosity_reward/mean": -1.3384881019592285, + "rewards/thinking_verbosity_reward/std": 0.2482069581747055, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 716.9453125, + "epoch": 0.16780045351473924, + "grad_norm": 0.014346621930599213, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19351300.0, + "rewards/KL_reward/mean": -0.003948048688471317, + "rewards/KL_reward/std": 0.004674157593399286, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.03262189030647278, + "rewards/angle_reward/std": 0.6847970485687256, + "rewards/thinking_verbosity_reward/mean": -1.2974423170089722, + "rewards/thinking_verbosity_reward/std": 0.3276059031486511, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 691.6875, + "epoch": 0.1689342403628118, + "grad_norm": 0.018055735155940056, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 19471884.0, + "rewards/KL_reward/mean": -0.0039238715544342995, + "rewards/KL_reward/std": 0.005335812456905842, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.05350875109434128, + "rewards/angle_reward/std": 0.701145350933075, + "rewards/thinking_verbosity_reward/mean": -1.2935394048690796, + "rewards/thinking_verbosity_reward/std": 0.23215413093566895, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 685.421875, + "epoch": 0.17006802721088435, + "grad_norm": 0.013902511447668076, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19591802.0, + "rewards/KL_reward/mean": -0.003565979190170765, + "rewards/KL_reward/std": 0.004245271440595388, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.06707803905010223, + "rewards/angle_reward/std": 0.6980970501899719, + "rewards/thinking_verbosity_reward/mean": -1.2839555740356445, + "rewards/thinking_verbosity_reward/std": 0.25100815296173096, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 605.4609375, + "epoch": 0.1712018140589569, + "grad_norm": 0.018808528780937195, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 19701733.0, + "rewards/KL_reward/mean": -0.003445668611675501, + "rewards/KL_reward/std": 0.004941299092024565, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.0014398042112588882, + "rewards/angle_reward/std": 0.7135060429573059, + "rewards/thinking_verbosity_reward/mean": -1.2097773551940918, + "rewards/thinking_verbosity_reward/std": 0.21896615624427795, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 702.46875, + "epoch": 0.17233560090702948, + "grad_norm": 0.017743032425642014, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 19822961.0, + "rewards/KL_reward/mean": -0.0039419992826879025, + "rewards/KL_reward/std": 0.005640941672027111, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.006016073748469353, + "rewards/angle_reward/std": 0.7430128455162048, + "rewards/thinking_verbosity_reward/mean": -1.291778802871704, + "rewards/thinking_verbosity_reward/std": 0.2926243841648102, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 710.6796875, + "epoch": 0.17346938775510204, + "grad_norm": 0.017250889912247658, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 19945792.0, + "rewards/KL_reward/mean": -0.004069920629262924, + "rewards/KL_reward/std": 0.004257708787918091, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": -0.1506166160106659, + "rewards/angle_reward/std": 0.7420839667320251, + "rewards/thinking_verbosity_reward/mean": -1.308785319328308, + "rewards/thinking_verbosity_reward/std": 0.24828357994556427, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 676.5234375, + "epoch": 0.1746031746031746, + "grad_norm": 0.017633303999900818, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20064731.0, + "rewards/KL_reward/mean": -0.004391741007566452, + "rewards/KL_reward/std": 0.005697107408195734, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.04565593600273132, + "rewards/angle_reward/std": 0.7310495376586914, + "rewards/thinking_verbosity_reward/mean": -1.278063416481018, + "rewards/thinking_verbosity_reward/std": 0.23613341152668, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 623.9140625, + "epoch": 0.17573696145124718, + "grad_norm": 0.019788647070527077, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20176464.0, + "rewards/KL_reward/mean": -0.0035868592094630003, + "rewards/KL_reward/std": 0.004520603455603123, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.09205663204193115, + "rewards/angle_reward/std": 0.7439991235733032, + "rewards/thinking_verbosity_reward/mean": -1.2228164672851562, + "rewards/thinking_verbosity_reward/std": 0.24998639523983002, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 676.6640625, + "epoch": 0.17687074829931973, + "grad_norm": 0.022692374885082245, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20294669.0, + "rewards/KL_reward/mean": -0.005349142476916313, + "rewards/KL_reward/std": 0.006089122965931892, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "rewards/angle_reward/mean": 0.08410148322582245, + "rewards/angle_reward/std": 0.7432141304016113, + "rewards/thinking_verbosity_reward/mean": -1.2565858364105225, + "rewards/thinking_verbosity_reward/std": 0.3331960439682007, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 682.1015625, + "epoch": 0.17800453514739228, + "grad_norm": 0.017600808292627335, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 20414010.0, + "rewards/KL_reward/mean": -0.004695170558989048, + "rewards/KL_reward/std": 0.004911729600280523, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.012458901852369308, + "rewards/angle_reward/std": 0.7319358587265015, + "rewards/thinking_verbosity_reward/mean": -1.2779560089111328, + "rewards/thinking_verbosity_reward/std": 0.26474660634994507, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 633.6875, + "epoch": 0.17913832199546487, + "grad_norm": 0.016389839351177216, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20527138.0, + "rewards/KL_reward/mean": -0.004976123105734587, + "rewards/KL_reward/std": 0.005367509555071592, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.016020849347114563, + "rewards/angle_reward/std": 0.7215960025787354, + "rewards/thinking_verbosity_reward/mean": -1.2339168787002563, + "rewards/thinking_verbosity_reward/std": 0.24415971338748932, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 627.59375, + "epoch": 0.18027210884353742, + "grad_norm": 0.018503598868846893, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 20639710.0, + "rewards/KL_reward/mean": -0.004522197414189577, + "rewards/KL_reward/std": 0.0054156603291630745, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.027919773012399673, + "rewards/angle_reward/std": 0.6923909783363342, + "rewards/thinking_verbosity_reward/mean": -1.2318589687347412, + "rewards/thinking_verbosity_reward/std": 0.22215832769870758, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 624.3046875, + "epoch": 0.18140589569160998, + "grad_norm": 0.019150318577885628, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20751677.0, + "rewards/KL_reward/mean": -0.005446942523121834, + "rewards/KL_reward/std": 0.005401700735092163, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": -0.03322335332632065, + "rewards/angle_reward/std": 0.7266063094139099, + "rewards/thinking_verbosity_reward/mean": -1.225290298461914, + "rewards/thinking_verbosity_reward/std": 0.23961004614830017, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 699.578125, + "epoch": 0.18253968253968253, + "grad_norm": 0.016021663323044777, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 20873047.0, + "rewards/KL_reward/mean": -0.003972027916461229, + "rewards/KL_reward/std": 0.00455878023058176, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "rewards/angle_reward/mean": 0.03642219305038452, + "rewards/angle_reward/std": 0.7052026391029358, + "rewards/thinking_verbosity_reward/mean": -1.2939989566802979, + "rewards/thinking_verbosity_reward/std": 0.2693682610988617, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 755.7421875, + "epoch": 0.1836734693877551, + "grad_norm": 0.015306858345866203, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21001646.0, + "rewards/KL_reward/mean": -0.004739418625831604, + "rewards/KL_reward/std": 0.004921747837215662, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.029500054195523262, + "rewards/angle_reward/std": 0.7210052609443665, + "rewards/thinking_verbosity_reward/mean": -1.3510175943374634, + "rewards/thinking_verbosity_reward/std": 0.249182790517807, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 628.515625, + "epoch": 0.18480725623582767, + "grad_norm": 0.01611776277422905, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21114136.0, + "rewards/KL_reward/mean": -0.005245381500571966, + "rewards/KL_reward/std": 0.006099026184529066, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.05297985300421715, + "rewards/angle_reward/std": 0.66853266954422, + "rewards/thinking_verbosity_reward/mean": -1.2296866178512573, + "rewards/thinking_verbosity_reward/std": 0.23880408704280853, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 752.7109375, + "epoch": 0.18594104308390022, + "grad_norm": 0.0170980766415596, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21242723.0, + "rewards/KL_reward/mean": -0.004587255418300629, + "rewards/KL_reward/std": 0.005099593196064234, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": 0.020532388240098953, + "rewards/angle_reward/std": 0.7177428603172302, + "rewards/thinking_verbosity_reward/mean": -1.3383744955062866, + "rewards/thinking_verbosity_reward/std": 0.29780030250549316, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 668.3125, + "epoch": 0.1870748299319728, + "grad_norm": 0.018160531297326088, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 21360507.0, + "rewards/KL_reward/mean": -0.00526096997782588, + "rewards/KL_reward/std": 0.004359393380582333, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.03587224334478378, + "rewards/angle_reward/std": 0.6858236789703369, + "rewards/thinking_verbosity_reward/mean": -1.2683125734329224, + "rewards/thinking_verbosity_reward/std": 0.24522313475608826, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 716.9375, + "epoch": 0.18820861678004536, + "grad_norm": 0.017265858128666878, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21484555.0, + "rewards/KL_reward/mean": -0.006219521164894104, + "rewards/KL_reward/std": 0.008716563694179058, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.043499838560819626, + "rewards/angle_reward/std": 0.7223663330078125, + "rewards/thinking_verbosity_reward/mean": -1.2908995151519775, + "rewards/thinking_verbosity_reward/std": 0.3525276184082031, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 781.984375, + "epoch": 0.1893424036281179, + "grad_norm": 0.01280480157583952, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21616745.0, + "rewards/KL_reward/mean": -0.005252651404589415, + "rewards/KL_reward/std": 0.004878875333815813, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.02107461914420128, + "rewards/angle_reward/std": 0.6955547332763672, + "rewards/thinking_verbosity_reward/mean": -1.3725743293762207, + "rewards/thinking_verbosity_reward/std": 0.2627544105052948, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 649.390625, + "epoch": 0.19047619047619047, + "grad_norm": 0.01947556994855404, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21732139.0, + "rewards/KL_reward/mean": -0.0048250844702124596, + "rewards/KL_reward/std": 0.004948804154992104, + "rewards/accuracy_reward/mean": 0.3828125, + "rewards/accuracy_reward/std": 0.4879830479621887, + "rewards/angle_reward/mean": 0.02219543047249317, + "rewards/angle_reward/std": 0.6966620087623596, + "rewards/thinking_verbosity_reward/mean": -1.2455801963806152, + "rewards/thinking_verbosity_reward/std": 0.26460281014442444, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 709.6171875, + "epoch": 0.19160997732426305, + "grad_norm": 0.017735688015818596, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 21855034.0, + "rewards/KL_reward/mean": -0.00399676663801074, + "rewards/KL_reward/std": 0.003837845753878355, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.06983543187379837, + "rewards/angle_reward/std": 0.7073096632957458, + "rewards/thinking_verbosity_reward/mean": -1.3072407245635986, + "rewards/thinking_verbosity_reward/std": 0.2513129711151123, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 671.9921875, + "epoch": 0.1927437641723356, + "grad_norm": 0.016666453331708908, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 21973249.0, + "rewards/KL_reward/mean": -0.0056351423263549805, + "rewards/KL_reward/std": 0.006115090101957321, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.025297701358795166, + "rewards/angle_reward/std": 0.7012860774993896, + "rewards/thinking_verbosity_reward/mean": -1.277807593345642, + "rewards/thinking_verbosity_reward/std": 0.21199442446231842, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 709.609375, + "epoch": 0.19387755102040816, + "grad_norm": 0.017414640635252, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22096039.0, + "rewards/KL_reward/mean": -0.005263416562229395, + "rewards/KL_reward/std": 0.005779444705694914, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.02822297252714634, + "rewards/angle_reward/std": 0.7202067971229553, + "rewards/thinking_verbosity_reward/mean": -1.306566834449768, + "rewards/thinking_verbosity_reward/std": 0.25482043623924255, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 813.4609375, + "epoch": 0.19501133786848074, + "grad_norm": 0.016978643834590912, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22232370.0, + "rewards/KL_reward/mean": -0.0045533087104558945, + "rewards/KL_reward/std": 0.004777958616614342, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.09315788745880127, + "rewards/angle_reward/std": 0.7188766598701477, + "rewards/thinking_verbosity_reward/mean": -1.3977937698364258, + "rewards/thinking_verbosity_reward/std": 0.2791663706302643, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 627.953125, + "epoch": 0.1961451247165533, + "grad_norm": 0.01868283748626709, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 22345340.0, + "rewards/KL_reward/mean": -0.005546521861106157, + "rewards/KL_reward/std": 0.005959734786301851, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.02303946577012539, + "rewards/angle_reward/std": 0.7035124897956848, + "rewards/thinking_verbosity_reward/mean": -1.232616662979126, + "rewards/thinking_verbosity_reward/std": 0.2200835943222046, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 660.578125, + "epoch": 0.19727891156462585, + "grad_norm": 0.01470309216529131, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 22461734.0, + "rewards/KL_reward/mean": -0.005267022177577019, + "rewards/KL_reward/std": 0.005062570795416832, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.0618949793279171, + "rewards/angle_reward/std": 0.6813735365867615, + "rewards/thinking_verbosity_reward/mean": -1.2643492221832275, + "rewards/thinking_verbosity_reward/std": 0.2252192497253418, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 633.5546875, + "epoch": 0.1984126984126984, + "grad_norm": 0.021716872230172157, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22574661.0, + "rewards/KL_reward/mean": -0.006059582345187664, + "rewards/KL_reward/std": 0.007729161065071821, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.016686338931322098, + "rewards/angle_reward/std": 0.7589618563652039, + "rewards/thinking_verbosity_reward/mean": -1.2372753620147705, + "rewards/thinking_verbosity_reward/std": 0.2257297784090042, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 672.90625, + "epoch": 0.19954648526077098, + "grad_norm": 0.016434574499726295, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22692505.0, + "rewards/KL_reward/mean": -0.006249883212149143, + "rewards/KL_reward/std": 0.00694391829892993, + "rewards/accuracy_reward/mean": 0.2578125, + "rewards/accuracy_reward/std": 0.43914902210235596, + "rewards/angle_reward/mean": 0.04599640890955925, + "rewards/angle_reward/std": 0.6874366998672485, + "rewards/thinking_verbosity_reward/mean": -1.2711100578308105, + "rewards/thinking_verbosity_reward/std": 0.25402602553367615, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 714.984375, + "epoch": 0.20068027210884354, + "grad_norm": 0.016488516703248024, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22815759.0, + "rewards/KL_reward/mean": -0.0067267632111907005, + "rewards/KL_reward/std": 0.006903097033500671, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.03039107471704483, + "rewards/angle_reward/std": 0.7145715951919556, + "rewards/thinking_verbosity_reward/mean": -1.3076138496398926, + "rewards/thinking_verbosity_reward/std": 0.2750972509384155, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 678.3046875, + "epoch": 0.2018140589569161, + "grad_norm": 0.018575025722384453, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 22934798.0, + "rewards/KL_reward/mean": -0.005837662611156702, + "rewards/KL_reward/std": 0.005946184508502483, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.0672629326581955, + "rewards/angle_reward/std": 0.71257084608078, + "rewards/thinking_verbosity_reward/mean": -1.2798986434936523, + "rewards/thinking_verbosity_reward/std": 0.23570102453231812, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 694.5703125, + "epoch": 0.20294784580498867, + "grad_norm": 0.016843745484948158, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23055239.0, + "rewards/KL_reward/mean": -0.005394851788878441, + "rewards/KL_reward/std": 0.005620879586786032, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "rewards/angle_reward/mean": -0.0011367611587047577, + "rewards/angle_reward/std": 0.7321102619171143, + "rewards/thinking_verbosity_reward/mean": -1.2880403995513916, + "rewards/thinking_verbosity_reward/std": 0.2747134864330292, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 594.6640625, + "epoch": 0.20408163265306123, + "grad_norm": 0.018888045102357864, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23162676.0, + "rewards/KL_reward/mean": -0.006592948921024799, + "rewards/KL_reward/std": 0.0066827163100242615, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "rewards/angle_reward/mean": 0.06055489182472229, + "rewards/angle_reward/std": 0.7142935395240784, + "rewards/thinking_verbosity_reward/mean": -1.1961841583251953, + "rewards/thinking_verbosity_reward/std": 0.23169764876365662, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 712.2578125, + "epoch": 0.20521541950113378, + "grad_norm": 0.01747574843466282, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 23286325.0, + "rewards/KL_reward/mean": -0.006673584692180157, + "rewards/KL_reward/std": 0.006165490951389074, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": 0.03159511089324951, + "rewards/angle_reward/std": 0.6844731569290161, + "rewards/thinking_verbosity_reward/mean": -1.304220199584961, + "rewards/thinking_verbosity_reward/std": 0.2787870168685913, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 577.1796875, + "epoch": 0.20634920634920634, + "grad_norm": 0.02173023857176304, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 23392516.0, + "rewards/KL_reward/mean": -0.007380689959973097, + "rewards/KL_reward/std": 0.005968100391328335, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": 0.017968900501728058, + "rewards/angle_reward/std": 0.7066680192947388, + "rewards/thinking_verbosity_reward/mean": -1.1748523712158203, + "rewards/thinking_verbosity_reward/std": 0.24626189470291138, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 620.234375, + "epoch": 0.20748299319727892, + "grad_norm": 0.015387197025120258, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 23503202.0, + "rewards/KL_reward/mean": -0.004902666434645653, + "rewards/KL_reward/std": 0.005735761020332575, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.07241170853376389, + "rewards/angle_reward/std": 0.6873176693916321, + "rewards/thinking_verbosity_reward/mean": -1.226454496383667, + "rewards/thinking_verbosity_reward/std": 0.21028883755207062, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 694.9609375, + "epoch": 0.20861678004535147, + "grad_norm": 0.01568688452243805, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23623917.0, + "rewards/KL_reward/mean": -0.005722587462514639, + "rewards/KL_reward/std": 0.006160435266792774, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": 0.00042659416794776917, + "rewards/angle_reward/std": 0.709008514881134, + "rewards/thinking_verbosity_reward/mean": -1.2791794538497925, + "rewards/thinking_verbosity_reward/std": 0.3151502311229706, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 724.484375, + "epoch": 0.20975056689342403, + "grad_norm": 0.019990328699350357, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23748843.0, + "rewards/KL_reward/mean": -0.005906912498176098, + "rewards/KL_reward/std": 0.005887902807444334, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.06703461706638336, + "rewards/angle_reward/std": 0.7245620489120483, + "rewards/thinking_verbosity_reward/mean": -1.2944716215133667, + "rewards/thinking_verbosity_reward/std": 0.3659032881259918, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 657.5703125, + "epoch": 0.2108843537414966, + "grad_norm": 0.016637342050671577, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 23864540.0, + "rewards/KL_reward/mean": -0.006674261763691902, + "rewards/KL_reward/std": 0.006491075269877911, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.02003803290426731, + "rewards/angle_reward/std": 0.7134320735931396, + "rewards/thinking_verbosity_reward/mean": -1.2597732543945312, + "rewards/thinking_verbosity_reward/std": 0.23402182757854462, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 673.4453125, + "epoch": 0.21201814058956917, + "grad_norm": 0.01620086468756199, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 23982269.0, + "rewards/KL_reward/mean": -0.007163557223975658, + "rewards/KL_reward/std": 0.006470571272075176, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.12528406083583832, + "rewards/angle_reward/std": 0.7103714942932129, + "rewards/thinking_verbosity_reward/mean": -1.2723393440246582, + "rewards/thinking_verbosity_reward/std": 0.25051143765449524, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 644.0234375, + "epoch": 0.21315192743764172, + "grad_norm": 0.019968464970588684, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24097072.0, + "rewards/KL_reward/mean": -0.0054155196994543076, + "rewards/KL_reward/std": 0.006106829270720482, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.005951676517724991, + "rewards/angle_reward/std": 0.7312422394752502, + "rewards/thinking_verbosity_reward/mean": -1.2448339462280273, + "rewards/thinking_verbosity_reward/std": 0.2416672259569168, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 635.1796875, + "epoch": 0.21428571428571427, + "grad_norm": 0.018636178225278854, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24209991.0, + "rewards/KL_reward/mean": -0.006905479356646538, + "rewards/KL_reward/std": 0.007862304337322712, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.023817699402570724, + "rewards/angle_reward/std": 0.7048460841178894, + "rewards/thinking_verbosity_reward/mean": -1.231690764427185, + "rewards/thinking_verbosity_reward/std": 0.2624240815639496, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 660.0546875, + "epoch": 0.21541950113378686, + "grad_norm": 0.018882103264331818, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 24326934.0, + "rewards/KL_reward/mean": -0.006683792918920517, + "rewards/KL_reward/std": 0.007892372086644173, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.006928920745849609, + "rewards/angle_reward/std": 0.7000882029533386, + "rewards/thinking_verbosity_reward/mean": -1.244320034980774, + "rewards/thinking_verbosity_reward/std": 0.3162398934364319, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 713.0703125, + "epoch": 0.2165532879818594, + "grad_norm": 0.01985430158674717, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24449231.0, + "rewards/KL_reward/mean": -0.006377742625772953, + "rewards/KL_reward/std": 0.006911171600222588, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": -0.02648763358592987, + "rewards/angle_reward/std": 0.7328572869300842, + "rewards/thinking_verbosity_reward/mean": -1.3061573505401611, + "rewards/thinking_verbosity_reward/std": 0.273336261510849, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 650.5546875, + "epoch": 0.21768707482993196, + "grad_norm": 0.02458028309047222, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24564926.0, + "rewards/KL_reward/mean": -0.005544313229620457, + "rewards/KL_reward/std": 0.005503201391547918, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.05760467052459717, + "rewards/angle_reward/std": 0.7331093549728394, + "rewards/thinking_verbosity_reward/mean": -1.2320880889892578, + "rewards/thinking_verbosity_reward/std": 0.32661738991737366, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 669.6953125, + "epoch": 0.21882086167800455, + "grad_norm": 0.01643040031194687, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24682391.0, + "rewards/KL_reward/mean": -0.006706792861223221, + "rewards/KL_reward/std": 0.007362376432865858, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.03348740190267563, + "rewards/angle_reward/std": 0.7014281749725342, + "rewards/thinking_verbosity_reward/mean": -1.2702393531799316, + "rewards/thinking_verbosity_reward/std": 0.24225826561450958, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 633.5234375, + "epoch": 0.2199546485260771, + "grad_norm": 0.018671313300728798, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24795274.0, + "rewards/KL_reward/mean": -0.0069778538309037685, + "rewards/KL_reward/std": 0.007189198397099972, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.0068779680877923965, + "rewards/angle_reward/std": 0.7145563364028931, + "rewards/thinking_verbosity_reward/mean": -1.2367829084396362, + "rewards/thinking_verbosity_reward/std": 0.22817501425743103, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 744.015625, + "epoch": 0.22108843537414966, + "grad_norm": 0.01673026755452156, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 24922652.0, + "rewards/KL_reward/mean": -0.005742833949625492, + "rewards/KL_reward/std": 0.005718303844332695, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.051033779978752136, + "rewards/angle_reward/std": 0.7201513051986694, + "rewards/thinking_verbosity_reward/mean": -1.3313870429992676, + "rewards/thinking_verbosity_reward/std": 0.2926025390625, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 608.9765625, + "epoch": 0.2222222222222222, + "grad_norm": 0.01862356625497341, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25032849.0, + "rewards/KL_reward/mean": -0.006777351722121239, + "rewards/KL_reward/std": 0.007024673279374838, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": -0.097476065158844, + "rewards/angle_reward/std": 0.7260298728942871, + "rewards/thinking_verbosity_reward/mean": -1.206028699874878, + "rewards/thinking_verbosity_reward/std": 0.25673383474349976, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 657.9921875, + "epoch": 0.2233560090702948, + "grad_norm": 0.020741797983646393, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25149072.0, + "rewards/KL_reward/mean": -0.007296534720808268, + "rewards/KL_reward/std": 0.00537205720320344, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.03954760730266571, + "rewards/angle_reward/std": 0.6764101982116699, + "rewards/thinking_verbosity_reward/mean": -1.2598237991333008, + "rewards/thinking_verbosity_reward/std": 0.23613522946834564, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 728.046875, + "epoch": 0.22448979591836735, + "grad_norm": 0.01808641105890274, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 25274166.0, + "rewards/KL_reward/mean": -0.005314648151397705, + "rewards/KL_reward/std": 0.005181093234568834, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "rewards/angle_reward/mean": -0.05089155212044716, + "rewards/angle_reward/std": 0.6896182894706726, + "rewards/thinking_verbosity_reward/mean": -1.3157517910003662, + "rewards/thinking_verbosity_reward/std": 0.29521846771240234, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 676.8828125, + "epoch": 0.2256235827664399, + "grad_norm": 0.018155431374907494, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25392647.0, + "rewards/KL_reward/mean": -0.0088615408167243, + "rewards/KL_reward/std": 0.008958039805293083, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.017155885696411133, + "rewards/angle_reward/std": 0.7221205234527588, + "rewards/thinking_verbosity_reward/mean": -1.254866600036621, + "rewards/thinking_verbosity_reward/std": 0.3405318260192871, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 631.78125, + "epoch": 0.22675736961451248, + "grad_norm": 0.01933540403842926, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25505107.0, + "rewards/KL_reward/mean": -0.007805454544723034, + "rewards/KL_reward/std": 0.006577686406672001, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.09517930448055267, + "rewards/angle_reward/std": 0.7301434278488159, + "rewards/thinking_verbosity_reward/mean": -1.2178688049316406, + "rewards/thinking_verbosity_reward/std": 0.30729082226753235, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 627.984375, + "epoch": 0.22789115646258504, + "grad_norm": 0.020539090037345886, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 25618113.0, + "rewards/KL_reward/mean": -0.00769702298566699, + "rewards/KL_reward/std": 0.005597667768597603, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.05146237462759018, + "rewards/angle_reward/std": 0.7511812448501587, + "rewards/thinking_verbosity_reward/mean": -1.2318042516708374, + "rewards/thinking_verbosity_reward/std": 0.22466543316841125, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 630.4140625, + "epoch": 0.2290249433106576, + "grad_norm": 0.019574997946619987, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 25730702.0, + "rewards/KL_reward/mean": -0.00823692511767149, + "rewards/KL_reward/std": 0.010290653444826603, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.009376266971230507, + "rewards/angle_reward/std": 0.7389089465141296, + "rewards/thinking_verbosity_reward/mean": -1.2259137630462646, + "rewards/thinking_verbosity_reward/std": 0.26688382029533386, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 568.375, + "epoch": 0.23015873015873015, + "grad_norm": 0.021541284397244453, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 25835294.0, + "rewards/KL_reward/mean": -0.008544719778001308, + "rewards/KL_reward/std": 0.007308653090149164, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.06592728942632675, + "rewards/angle_reward/std": 0.7116559743881226, + "rewards/thinking_verbosity_reward/mean": -1.162217140197754, + "rewards/thinking_verbosity_reward/std": 0.26111775636672974, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 715.265625, + "epoch": 0.23129251700680273, + "grad_norm": 0.01561558898538351, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 25958592.0, + "rewards/KL_reward/mean": -0.006122298073023558, + "rewards/KL_reward/std": 0.005869515240192413, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.055072057992219925, + "rewards/angle_reward/std": 0.7151932716369629, + "rewards/thinking_verbosity_reward/mean": -1.3064992427825928, + "rewards/thinking_verbosity_reward/std": 0.28168007731437683, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 712.671875, + "epoch": 0.23242630385487528, + "grad_norm": 0.019142666831612587, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26081982.0, + "rewards/KL_reward/mean": -0.006161075085401535, + "rewards/KL_reward/std": 0.006036452483385801, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.04611802101135254, + "rewards/angle_reward/std": 0.7256365418434143, + "rewards/thinking_verbosity_reward/mean": -1.3043410778045654, + "rewards/thinking_verbosity_reward/std": 0.28012052178382874, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 730.2421875, + "epoch": 0.23356009070294784, + "grad_norm": 0.016479630023241043, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26207229.0, + "rewards/KL_reward/mean": -0.00602799654006958, + "rewards/KL_reward/std": 0.0058203889057040215, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.03317299112677574, + "rewards/angle_reward/std": 0.6820391416549683, + "rewards/thinking_verbosity_reward/mean": -1.3224318027496338, + "rewards/thinking_verbosity_reward/std": 0.2736136019229889, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 738.328125, + "epoch": 0.23469387755102042, + "grad_norm": 0.015681680291891098, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 26333447.0, + "rewards/KL_reward/mean": -0.005454889498651028, + "rewards/KL_reward/std": 0.00478137843310833, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": 0.024364903569221497, + "rewards/angle_reward/std": 0.7025476098060608, + "rewards/thinking_verbosity_reward/mean": -1.340835452079773, + "rewards/thinking_verbosity_reward/std": 0.21396200358867645, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 587.4765625, + "epoch": 0.23582766439909297, + "grad_norm": 0.019554290920495987, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26440188.0, + "rewards/KL_reward/mean": -0.006931736133992672, + "rewards/KL_reward/std": 0.005958786234259605, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.03052525222301483, + "rewards/angle_reward/std": 0.7194740772247314, + "rewards/thinking_verbosity_reward/mean": -1.1897162199020386, + "rewards/thinking_verbosity_reward/std": 0.22624364495277405, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 592.21875, + "epoch": 0.23696145124716553, + "grad_norm": 0.020028211176395416, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 26548352.0, + "rewards/KL_reward/mean": -0.007310900837182999, + "rewards/KL_reward/std": 0.00627494789659977, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.03306598216295242, + "rewards/angle_reward/std": 0.7165825366973877, + "rewards/thinking_verbosity_reward/mean": -1.190523386001587, + "rewards/thinking_verbosity_reward/std": 0.24726401269435883, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 634.4140625, + "epoch": 0.23809523809523808, + "grad_norm": 0.015789972618222237, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 26661957.0, + "rewards/KL_reward/mean": -0.007268001325428486, + "rewards/KL_reward/std": 0.006919928826391697, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": -0.01383179146796465, + "rewards/angle_reward/std": 0.6974374651908875, + "rewards/thinking_verbosity_reward/mean": -1.238418698310852, + "rewards/thinking_verbosity_reward/std": 0.22415798902511597, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 711.7421875, + "epoch": 0.23922902494331066, + "grad_norm": 0.016343258321285248, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 26785052.0, + "rewards/KL_reward/mean": -0.00694778747856617, + "rewards/KL_reward/std": 0.0060117426328361034, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": 0.012593336403369904, + "rewards/angle_reward/std": 0.7179101705551147, + "rewards/thinking_verbosity_reward/mean": -1.2946412563323975, + "rewards/thinking_verbosity_reward/std": 0.31867337226867676, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 576.5, + "epoch": 0.24036281179138322, + "grad_norm": 0.01915968768298626, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 26890964.0, + "rewards/KL_reward/mean": -0.0069218226708471775, + "rewards/KL_reward/std": 0.0060286265797913074, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.041994426399469376, + "rewards/angle_reward/std": 0.7082900404930115, + "rewards/thinking_verbosity_reward/mean": -1.1769347190856934, + "rewards/thinking_verbosity_reward/std": 0.23232969641685486, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 584.6875, + "epoch": 0.24149659863945577, + "grad_norm": 0.02144310250878334, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 26997972.0, + "rewards/KL_reward/mean": -0.00726801622658968, + "rewards/KL_reward/std": 0.006610418204218149, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.05485362187027931, + "rewards/angle_reward/std": 0.6939930319786072, + "rewards/thinking_verbosity_reward/mean": -1.1847119331359863, + "rewards/thinking_verbosity_reward/std": 0.2368851751089096, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 699.6015625, + "epoch": 0.24263038548752835, + "grad_norm": 0.018099481239914894, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27119977.0, + "rewards/KL_reward/mean": -0.0071920109912753105, + "rewards/KL_reward/std": 0.005825213622301817, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": 0.05808936432003975, + "rewards/angle_reward/std": 0.6950080990791321, + "rewards/thinking_verbosity_reward/mean": -1.2834217548370361, + "rewards/thinking_verbosity_reward/std": 0.3164091110229492, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 672.8984375, + "epoch": 0.2437641723356009, + "grad_norm": 0.01659393310546875, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27238180.0, + "rewards/KL_reward/mean": -0.007104963064193726, + "rewards/KL_reward/std": 0.006704287137836218, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": 0.05779623985290527, + "rewards/angle_reward/std": 0.68217533826828, + "rewards/thinking_verbosity_reward/mean": -1.2643368244171143, + "rewards/thinking_verbosity_reward/std": 0.2861078679561615, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 625.046875, + "epoch": 0.24489795918367346, + "grad_norm": 0.02007698453962803, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27349314.0, + "rewards/KL_reward/mean": -0.008289286866784096, + "rewards/KL_reward/std": 0.007536042481660843, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "rewards/angle_reward/mean": -0.006564207375049591, + "rewards/angle_reward/std": 0.6777621507644653, + "rewards/thinking_verbosity_reward/mean": -1.2202589511871338, + "rewards/thinking_verbosity_reward/std": 0.2676292657852173, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 649.1640625, + "epoch": 0.24603174603174602, + "grad_norm": 0.019388757646083832, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 27464199.0, + "rewards/KL_reward/mean": -0.0074159977957606316, + "rewards/KL_reward/std": 0.007599027827382088, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "rewards/angle_reward/mean": -0.041372545063495636, + "rewards/angle_reward/std": 0.7221158146858215, + "rewards/thinking_verbosity_reward/mean": -1.249146580696106, + "rewards/thinking_verbosity_reward/std": 0.24595995247364044, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 688.09375, + "epoch": 0.2471655328798186, + "grad_norm": 0.020711416378617287, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 27583491.0, + "rewards/KL_reward/mean": -0.008757997304201126, + "rewards/KL_reward/std": 0.008113629184663296, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.049838222563266754, + "rewards/angle_reward/std": 0.7196351885795593, + "rewards/thinking_verbosity_reward/mean": -1.2596156597137451, + "rewards/thinking_verbosity_reward/std": 0.3635037839412689, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 687.9140625, + "epoch": 0.24829931972789115, + "grad_norm": 0.019323069602251053, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27703360.0, + "rewards/KL_reward/mean": -0.006892567500472069, + "rewards/KL_reward/std": 0.006044676527380943, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.059788353741168976, + "rewards/angle_reward/std": 0.734419584274292, + "rewards/thinking_verbosity_reward/mean": -1.2848849296569824, + "rewards/thinking_verbosity_reward/std": 0.25860872864723206, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 574.546875, + "epoch": 0.2494331065759637, + "grad_norm": 0.019302789121866226, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27809030.0, + "rewards/KL_reward/mean": -0.006590262986719608, + "rewards/KL_reward/std": 0.006840670946985483, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "rewards/angle_reward/mean": -0.0019294023513793945, + "rewards/angle_reward/std": 0.7161747217178345, + "rewards/thinking_verbosity_reward/mean": -1.1663904190063477, + "rewards/thinking_verbosity_reward/std": 0.2719900608062744, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 689.09375, + "epoch": 0.25056689342403626, + "grad_norm": 0.01757318712770939, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 27928754.0, + "rewards/KL_reward/mean": -0.007285448722541332, + "rewards/KL_reward/std": 0.006790507584810257, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": 0.013903097249567509, + "rewards/angle_reward/std": 0.7119504809379578, + "rewards/thinking_verbosity_reward/mean": -1.2859375476837158, + "rewards/thinking_verbosity_reward/std": 0.25896739959716797, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 686.34375, + "epoch": 0.25170068027210885, + "grad_norm": 0.01846483163535595, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28048862.0, + "rewards/KL_reward/mean": -0.007576250471174717, + "rewards/KL_reward/std": 0.0076942844316363335, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.010099878534674644, + "rewards/angle_reward/std": 0.7031623125076294, + "rewards/thinking_verbosity_reward/mean": -1.2700605392456055, + "rewards/thinking_verbosity_reward/std": 0.31792449951171875, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 668.4453125, + "epoch": 0.2528344671201814, + "grad_norm": 0.01812918856739998, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28166479.0, + "rewards/KL_reward/mean": -0.0062308646738529205, + "rewards/KL_reward/std": 0.006511870305985212, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.050949279218912125, + "rewards/angle_reward/std": 0.705336332321167, + "rewards/thinking_verbosity_reward/mean": -1.2666432857513428, + "rewards/thinking_verbosity_reward/std": 0.2544286251068115, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 646.8359375, + "epoch": 0.25396825396825395, + "grad_norm": 0.022773467004299164, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 28281346.0, + "rewards/KL_reward/mean": -0.006268291734158993, + "rewards/KL_reward/std": 0.007012519519776106, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "rewards/angle_reward/mean": -0.10227032750844955, + "rewards/angle_reward/std": 0.7190137505531311, + "rewards/thinking_verbosity_reward/mean": -1.248525857925415, + "rewards/thinking_verbosity_reward/std": 0.23709122836589813, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 702.3046875, + "epoch": 0.25510204081632654, + "grad_norm": 0.020715147256851196, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28403489.0, + "rewards/KL_reward/mean": -0.0062208836898207664, + "rewards/KL_reward/std": 0.005619505885988474, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.0026268325746059418, + "rewards/angle_reward/std": 0.7080735564231873, + "rewards/thinking_verbosity_reward/mean": -1.2969403266906738, + "rewards/thinking_verbosity_reward/std": 0.2679326832294464, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 678.3359375, + "epoch": 0.2562358276643991, + "grad_norm": 0.01669652946293354, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28521908.0, + "rewards/KL_reward/mean": -0.006833111867308617, + "rewards/KL_reward/std": 0.006781416945159435, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.009288817644119263, + "rewards/angle_reward/std": 0.711402177810669, + "rewards/thinking_verbosity_reward/mean": -1.2574052810668945, + "rewards/thinking_verbosity_reward/std": 0.33632826805114746, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 662.515625, + "epoch": 0.25736961451247165, + "grad_norm": 0.0172143392264843, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28639022.0, + "rewards/KL_reward/mean": -0.006038540508598089, + "rewards/KL_reward/std": 0.006242460571229458, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.006223671138286591, + "rewards/angle_reward/std": 0.7244890332221985, + "rewards/thinking_verbosity_reward/mean": -1.2689580917358398, + "rewards/thinking_verbosity_reward/std": 0.20937541127204895, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 671.0234375, + "epoch": 0.2585034013605442, + "grad_norm": 0.01777282916009426, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 28756705.0, + "rewards/KL_reward/mean": -0.006924336310476065, + "rewards/KL_reward/std": 0.006979053374379873, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.003234894946217537, + "rewards/angle_reward/std": 0.7178221344947815, + "rewards/thinking_verbosity_reward/mean": -1.2622389793395996, + "rewards/thinking_verbosity_reward/std": 0.2871507406234741, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 714.59375, + "epoch": 0.25963718820861675, + "grad_norm": 0.015465959906578064, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28880077.0, + "rewards/KL_reward/mean": -0.006152710411697626, + "rewards/KL_reward/std": 0.0048719304613769054, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.05258213356137276, + "rewards/angle_reward/std": 0.7208319306373596, + "rewards/thinking_verbosity_reward/mean": -1.3186596632003784, + "rewards/thinking_verbosity_reward/std": 0.21306687593460083, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 623.875, + "epoch": 0.26077097505668934, + "grad_norm": 0.01664859429001808, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 28991597.0, + "rewards/KL_reward/mean": -0.006472930312156677, + "rewards/KL_reward/std": 0.0078015257604420185, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.11813303828239441, + "rewards/angle_reward/std": 0.6910309195518494, + "rewards/thinking_verbosity_reward/mean": -1.2249972820281982, + "rewards/thinking_verbosity_reward/std": 0.23877747356891632, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 656.6640625, + "epoch": 0.2619047619047619, + "grad_norm": 0.014789329841732979, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 29107346.0, + "rewards/KL_reward/mean": -0.00707493769004941, + "rewards/KL_reward/std": 0.007159274537116289, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.018108312040567398, + "rewards/angle_reward/std": 0.6884683966636658, + "rewards/thinking_verbosity_reward/mean": -1.259082555770874, + "rewards/thinking_verbosity_reward/std": 0.2328866571187973, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 664.0390625, + "epoch": 0.26303854875283444, + "grad_norm": 0.01746738702058792, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29223791.0, + "rewards/KL_reward/mean": -0.006290224380791187, + "rewards/KL_reward/std": 0.005722086876630783, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.02040044218301773, + "rewards/angle_reward/std": 0.6894627809524536, + "rewards/thinking_verbosity_reward/mean": -1.2532029151916504, + "rewards/thinking_verbosity_reward/std": 0.2962525188922882, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 690.0078125, + "epoch": 0.264172335600907, + "grad_norm": 0.01679716631770134, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29344472.0, + "rewards/KL_reward/mean": -0.006983471103012562, + "rewards/KL_reward/std": 0.005662387702614069, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": 0.028897471725940704, + "rewards/angle_reward/std": 0.7165486812591553, + "rewards/thinking_verbosity_reward/mean": -1.2830872535705566, + "rewards/thinking_verbosity_reward/std": 0.27706924080848694, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 563.6171875, + "epoch": 0.2653061224489796, + "grad_norm": 0.019079288467764854, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29449143.0, + "rewards/KL_reward/mean": -0.008397059515118599, + "rewards/KL_reward/std": 0.008088743314146996, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.021606430411338806, + "rewards/angle_reward/std": 0.7255200743675232, + "rewards/thinking_verbosity_reward/mean": -1.1630661487579346, + "rewards/thinking_verbosity_reward/std": 0.23281878232955933, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 681.578125, + "epoch": 0.26643990929705214, + "grad_norm": 0.01799463853240013, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 29568177.0, + "rewards/KL_reward/mean": -0.006513974629342556, + "rewards/KL_reward/std": 0.0063631427474319935, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.05794283002614975, + "rewards/angle_reward/std": 0.7300118803977966, + "rewards/thinking_verbosity_reward/mean": -1.268895149230957, + "rewards/thinking_verbosity_reward/std": 0.30339720845222473, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 643.3984375, + "epoch": 0.2675736961451247, + "grad_norm": 0.016560139134526253, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 29682316.0, + "rewards/KL_reward/mean": -0.007001581601798534, + "rewards/KL_reward/std": 0.007529091089963913, + "rewards/accuracy_reward/mean": 0.3515625, + "rewards/accuracy_reward/std": 0.4793342351913452, + "rewards/angle_reward/mean": -0.009729281067848206, + "rewards/angle_reward/std": 0.654257595539093, + "rewards/thinking_verbosity_reward/mean": -1.2387820482254028, + "rewards/thinking_verbosity_reward/std": 0.26832306385040283, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 616.9296875, + "epoch": 0.2687074829931973, + "grad_norm": 0.01996929757297039, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29793427.0, + "rewards/KL_reward/mean": -0.008148357272148132, + "rewards/KL_reward/std": 0.006440743338316679, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.01599818468093872, + "rewards/angle_reward/std": 0.7043137550354004, + "rewards/thinking_verbosity_reward/mean": -1.2252111434936523, + "rewards/thinking_verbosity_reward/std": 0.1973995566368103, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 707.1640625, + "epoch": 0.2698412698412698, + "grad_norm": 0.01589009165763855, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 29916136.0, + "rewards/KL_reward/mean": -0.007304298225790262, + "rewards/KL_reward/std": 0.007028522901237011, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.022989019751548767, + "rewards/angle_reward/std": 0.7293647527694702, + "rewards/thinking_verbosity_reward/mean": -1.2988755702972412, + "rewards/thinking_verbosity_reward/std": 0.28087329864501953, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 608.546875, + "epoch": 0.2709750566893424, + "grad_norm": 0.020093601197004318, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30025806.0, + "rewards/KL_reward/mean": -0.007500161416828632, + "rewards/KL_reward/std": 0.006967926397919655, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.0615442618727684, + "rewards/angle_reward/std": 0.7171436548233032, + "rewards/thinking_verbosity_reward/mean": -1.215866208076477, + "rewards/thinking_verbosity_reward/std": 0.20203055441379547, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 687.5390625, + "epoch": 0.272108843537415, + "grad_norm": 0.019968822598457336, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 30146115.0, + "rewards/KL_reward/mean": -0.0064809489995241165, + "rewards/KL_reward/std": 0.006810910999774933, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.010318879038095474, + "rewards/angle_reward/std": 0.732631504535675, + "rewards/thinking_verbosity_reward/mean": -1.2788469791412354, + "rewards/thinking_verbosity_reward/std": 0.2855779230594635, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 705.3984375, + "epoch": 0.2732426303854875, + "grad_norm": 0.02054375596344471, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30268374.0, + "rewards/KL_reward/mean": -0.007403380237519741, + "rewards/KL_reward/std": 0.007512849755585194, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.007134806364774704, + "rewards/angle_reward/std": 0.7200664281845093, + "rewards/thinking_verbosity_reward/mean": -1.2822654247283936, + "rewards/thinking_verbosity_reward/std": 0.34311890602111816, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 656.6171875, + "epoch": 0.2743764172335601, + "grad_norm": 0.018039554357528687, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30384125.0, + "rewards/KL_reward/mean": -0.008808700367808342, + "rewards/KL_reward/std": 0.007601337973028421, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.059877797961235046, + "rewards/angle_reward/std": 0.6769348978996277, + "rewards/thinking_verbosity_reward/mean": -1.2398130893707275, + "rewards/thinking_verbosity_reward/std": 0.32048264145851135, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 558.1875, + "epoch": 0.2755102040816326, + "grad_norm": 0.01742182858288288, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 30487541.0, + "rewards/KL_reward/mean": -0.009268783032894135, + "rewards/KL_reward/std": 0.009539565071463585, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.01601773127913475, + "rewards/angle_reward/std": 0.7038976550102234, + "rewards/thinking_verbosity_reward/mean": -1.160976767539978, + "rewards/thinking_verbosity_reward/std": 0.21325227618217468, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 638.0546875, + "epoch": 0.2766439909297052, + "grad_norm": 0.02026633732020855, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30600956.0, + "rewards/KL_reward/mean": -0.007554056588560343, + "rewards/KL_reward/std": 0.007229703012853861, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.194504976272583, + "rewards/angle_reward/mean": -0.03071395494043827, + "rewards/angle_reward/std": 0.7103466987609863, + "rewards/thinking_verbosity_reward/mean": -1.2354111671447754, + "rewards/thinking_verbosity_reward/std": 0.2587624788284302, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 605.96875, + "epoch": 0.2777777777777778, + "grad_norm": 0.02038923092186451, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 30710352.0, + "rewards/KL_reward/mean": -0.008146708831191063, + "rewards/KL_reward/std": 0.008665196597576141, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "rewards/angle_reward/mean": -0.035703569650650024, + "rewards/angle_reward/std": 0.7312946319580078, + "rewards/thinking_verbosity_reward/mean": -1.197398066520691, + "rewards/thinking_verbosity_reward/std": 0.28156572580337524, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 633.4453125, + "epoch": 0.2789115646258503, + "grad_norm": 0.018543001264333725, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 30823161.0, + "rewards/KL_reward/mean": -0.0074415747076272964, + "rewards/KL_reward/std": 0.00573266576975584, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.05781393498182297, + "rewards/angle_reward/std": 0.7158551216125488, + "rewards/thinking_verbosity_reward/mean": -1.2390161752700806, + "rewards/thinking_verbosity_reward/std": 0.2151612788438797, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 643.2421875, + "epoch": 0.2800453514739229, + "grad_norm": 0.01924288645386696, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 30937840.0, + "rewards/KL_reward/mean": -0.007157555781304836, + "rewards/KL_reward/std": 0.00579442223533988, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.005112181417644024, + "rewards/angle_reward/std": 0.7062931656837463, + "rewards/thinking_verbosity_reward/mean": -1.250734806060791, + "rewards/thinking_verbosity_reward/std": 0.20379839837551117, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 614.3046875, + "epoch": 0.2811791383219955, + "grad_norm": 0.02079552412033081, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31048727.0, + "rewards/KL_reward/mean": -0.008803189732134342, + "rewards/KL_reward/std": 0.008303167298436165, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.02736246958374977, + "rewards/angle_reward/std": 0.7156417369842529, + "rewards/thinking_verbosity_reward/mean": -1.2185291051864624, + "rewards/thinking_verbosity_reward/std": 0.22128619253635406, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 686.7734375, + "epoch": 0.282312925170068, + "grad_norm": 0.015999827533960342, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31168642.0, + "rewards/KL_reward/mean": -0.005788822192698717, + "rewards/KL_reward/std": 0.005088903941214085, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": 0.09513027220964432, + "rewards/angle_reward/std": 0.708631694316864, + "rewards/thinking_verbosity_reward/mean": -1.287979006767273, + "rewards/thinking_verbosity_reward/std": 0.2363952398300171, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 676.6953125, + "epoch": 0.2834467120181406, + "grad_norm": 0.016545815393328667, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 31287899.0, + "rewards/KL_reward/mean": -0.007248513400554657, + "rewards/KL_reward/std": 0.007056637667119503, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": 0.050945453345775604, + "rewards/angle_reward/std": 0.6914721131324768, + "rewards/thinking_verbosity_reward/mean": -1.2786433696746826, + "rewards/thinking_verbosity_reward/std": 0.23383289575576782, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 607.78125, + "epoch": 0.28458049886621317, + "grad_norm": 0.01953643001616001, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31397447.0, + "rewards/KL_reward/mean": -0.008097594603896141, + "rewards/KL_reward/std": 0.00676872069016099, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": 0.00671498104929924, + "rewards/angle_reward/std": 0.6774688959121704, + "rewards/thinking_verbosity_reward/mean": -1.2069485187530518, + "rewards/thinking_verbosity_reward/std": 0.24637892842292786, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 727.6953125, + "epoch": 0.2857142857142857, + "grad_norm": 0.015975242480635643, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31522440.0, + "rewards/KL_reward/mean": -0.00571840675547719, + "rewards/KL_reward/std": 0.006018051877617836, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.026372039690613747, + "rewards/angle_reward/std": 0.6953475475311279, + "rewards/thinking_verbosity_reward/mean": -1.3279554843902588, + "rewards/thinking_verbosity_reward/std": 0.23171544075012207, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 671.3671875, + "epoch": 0.2868480725623583, + "grad_norm": 0.017872359603643417, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 31639087.0, + "rewards/KL_reward/mean": -0.006082674488425255, + "rewards/KL_reward/std": 0.0052271937020123005, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.054664481431245804, + "rewards/angle_reward/std": 0.7107526063919067, + "rewards/thinking_verbosity_reward/mean": -1.2752413749694824, + "rewards/thinking_verbosity_reward/std": 0.22371646761894226, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 580.8046875, + "epoch": 0.28798185941043086, + "grad_norm": 0.02044520527124405, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31745310.0, + "rewards/KL_reward/mean": -0.006792373023927212, + "rewards/KL_reward/std": 0.006417332217097282, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": -0.07211057841777802, + "rewards/angle_reward/std": 0.7001871466636658, + "rewards/thinking_verbosity_reward/mean": -1.1845359802246094, + "rewards/thinking_verbosity_reward/std": 0.21612995862960815, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 636.625, + "epoch": 0.2891156462585034, + "grad_norm": 0.018003787845373154, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31858366.0, + "rewards/KL_reward/mean": -0.007784110493957996, + "rewards/KL_reward/std": 0.007525305729359388, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.045843563973903656, + "rewards/angle_reward/std": 0.7304276823997498, + "rewards/thinking_verbosity_reward/mean": -1.2383641004562378, + "rewards/thinking_verbosity_reward/std": 0.2365047037601471, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 630.3046875, + "epoch": 0.29024943310657597, + "grad_norm": 0.01779160089790821, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 31971045.0, + "rewards/KL_reward/mean": -0.006721969693899155, + "rewards/KL_reward/std": 0.00615214416757226, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.04389385133981705, + "rewards/angle_reward/std": 0.6879055500030518, + "rewards/thinking_verbosity_reward/mean": -1.2370407581329346, + "rewards/thinking_verbosity_reward/std": 0.20806531608104706, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 663.0859375, + "epoch": 0.29138321995464855, + "grad_norm": 0.016839314252138138, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 32087152.0, + "rewards/KL_reward/mean": -0.007783721201121807, + "rewards/KL_reward/std": 0.007356081623584032, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.020175354555249214, + "rewards/angle_reward/std": 0.7190738916397095, + "rewards/thinking_verbosity_reward/mean": -1.258847951889038, + "rewards/thinking_verbosity_reward/std": 0.26659345626831055, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 634.453125, + "epoch": 0.2925170068027211, + "grad_norm": 0.018239034339785576, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 32200514.0, + "rewards/KL_reward/mean": -0.007003012113273144, + "rewards/KL_reward/std": 0.006014623213559389, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.010274749249219894, + "rewards/angle_reward/std": 0.6887149214744568, + "rewards/thinking_verbosity_reward/mean": -1.2421730756759644, + "rewards/thinking_verbosity_reward/std": 0.20233437418937683, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 707.8125, + "epoch": 0.29365079365079366, + "grad_norm": 0.014524643309414387, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 32322642.0, + "rewards/KL_reward/mean": -0.005313513800501823, + "rewards/KL_reward/std": 0.005417239386588335, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.004655953496694565, + "rewards/angle_reward/std": 0.7026400566101074, + "rewards/thinking_verbosity_reward/mean": -1.2959966659545898, + "rewards/thinking_verbosity_reward/std": 0.2969035506248474, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 715.4453125, + "epoch": 0.2947845804988662, + "grad_norm": 0.01586010307073593, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32446371.0, + "rewards/KL_reward/mean": -0.007117181550711393, + "rewards/KL_reward/std": 0.005675325635820627, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.04042618349194527, + "rewards/angle_reward/std": 0.6932578682899475, + "rewards/thinking_verbosity_reward/mean": -1.318619728088379, + "rewards/thinking_verbosity_reward/std": 0.21846811473369598, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 702.765625, + "epoch": 0.29591836734693877, + "grad_norm": 0.017490733414888382, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 32568389.0, + "rewards/KL_reward/mean": -0.006071691866964102, + "rewards/KL_reward/std": 0.0064462595619261265, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.06298032402992249, + "rewards/angle_reward/std": 0.7073873281478882, + "rewards/thinking_verbosity_reward/mean": -1.2703129053115845, + "rewards/thinking_verbosity_reward/std": 0.3767041265964508, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 667.46875, + "epoch": 0.29705215419501135, + "grad_norm": 0.01801498606801033, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32685761.0, + "rewards/KL_reward/mean": -0.007192197255790234, + "rewards/KL_reward/std": 0.008388747461140156, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.060268156230449677, + "rewards/angle_reward/std": 0.7170228362083435, + "rewards/thinking_verbosity_reward/mean": -1.2398436069488525, + "rewards/thinking_verbosity_reward/std": 0.36049333214759827, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 763.4453125, + "epoch": 0.2981859410430839, + "grad_norm": 0.01567150466144085, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32815362.0, + "rewards/KL_reward/mean": -0.0055938586592674255, + "rewards/KL_reward/std": 0.006172350607812405, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.07238460332155228, + "rewards/angle_reward/std": 0.7077392339706421, + "rewards/thinking_verbosity_reward/mean": -1.3383076190948486, + "rewards/thinking_verbosity_reward/std": 0.3404884338378906, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 580.0390625, + "epoch": 0.29931972789115646, + "grad_norm": 0.02081012912094593, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 32921879.0, + "rewards/KL_reward/mean": -0.0069402060471475124, + "rewards/KL_reward/std": 0.007032873574644327, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.06987656652927399, + "rewards/angle_reward/std": 0.7268014550209045, + "rewards/thinking_verbosity_reward/mean": -1.1844277381896973, + "rewards/thinking_verbosity_reward/std": 0.2123224288225174, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 759.3125, + "epoch": 0.30045351473922904, + "grad_norm": 0.014871988445520401, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33051487.0, + "rewards/KL_reward/mean": -0.005905265919864178, + "rewards/KL_reward/std": 0.005857815034687519, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": 0.04666680470108986, + "rewards/angle_reward/std": 0.6774764060974121, + "rewards/thinking_verbosity_reward/mean": -1.347033977508545, + "rewards/thinking_verbosity_reward/std": 0.2861498296260834, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 666.671875, + "epoch": 0.30158730158730157, + "grad_norm": 0.019435271620750427, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 33168613.0, + "rewards/KL_reward/mean": -0.007942674681544304, + "rewards/KL_reward/std": 0.006478943396359682, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": 0.03964770957827568, + "rewards/angle_reward/std": 0.7271708250045776, + "rewards/thinking_verbosity_reward/mean": -1.2621772289276123, + "rewards/thinking_verbosity_reward/std": 0.2675977051258087, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 708.4765625, + "epoch": 0.30272108843537415, + "grad_norm": 0.0172012560069561, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 33291626.0, + "rewards/KL_reward/mean": -0.006565619260072708, + "rewards/KL_reward/std": 0.00826906319707632, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": -0.0016555637121200562, + "rewards/angle_reward/std": 0.7259076237678528, + "rewards/thinking_verbosity_reward/mean": -1.3093411922454834, + "rewards/thinking_verbosity_reward/std": 0.23389175534248352, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 673.6796875, + "epoch": 0.30385487528344673, + "grad_norm": 0.017249425873160362, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 33409025.0, + "rewards/KL_reward/mean": -0.005863506346940994, + "rewards/KL_reward/std": 0.006127702537924051, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.00578656792640686, + "rewards/angle_reward/std": 0.7214607000350952, + "rewards/thinking_verbosity_reward/mean": -1.269033432006836, + "rewards/thinking_verbosity_reward/std": 0.2679460644721985, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 633.171875, + "epoch": 0.30498866213151926, + "grad_norm": 0.01901894062757492, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33522199.0, + "rewards/KL_reward/mean": -0.007426909636706114, + "rewards/KL_reward/std": 0.006230643484741449, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.046921081840991974, + "rewards/angle_reward/std": 0.685775876045227, + "rewards/thinking_verbosity_reward/mean": -1.2192673683166504, + "rewards/thinking_verbosity_reward/std": 0.3074667751789093, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 689.234375, + "epoch": 0.30612244897959184, + "grad_norm": 0.01681593805551529, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33642533.0, + "rewards/KL_reward/mean": -0.0071766008622944355, + "rewards/KL_reward/std": 0.006169370375573635, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.05446804687380791, + "rewards/angle_reward/std": 0.6952854990959167, + "rewards/thinking_verbosity_reward/mean": -1.2952487468719482, + "rewards/thinking_verbosity_reward/std": 0.207930326461792, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 661.78125, + "epoch": 0.3072562358276644, + "grad_norm": 0.01800641044974327, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 33759337.0, + "rewards/KL_reward/mean": -0.006021884270012379, + "rewards/KL_reward/std": 0.0057541439309716225, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.02188066579401493, + "rewards/angle_reward/std": 0.6984922885894775, + "rewards/thinking_verbosity_reward/mean": -1.2585150003433228, + "rewards/thinking_verbosity_reward/std": 0.2620096206665039, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 637.9453125, + "epoch": 0.30839002267573695, + "grad_norm": 0.016137458384037018, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33873570.0, + "rewards/KL_reward/mean": -0.0077069224789738655, + "rewards/KL_reward/std": 0.00656564487144351, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.028969956561923027, + "rewards/angle_reward/std": 0.70253586769104, + "rewards/thinking_verbosity_reward/mean": -1.243137240409851, + "rewards/thinking_verbosity_reward/std": 0.21758389472961426, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 707.7265625, + "epoch": 0.30952380952380953, + "grad_norm": 0.01791304349899292, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 33996679.0, + "rewards/KL_reward/mean": -0.005264196544885635, + "rewards/KL_reward/std": 0.005644720047712326, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.0020773429423570633, + "rewards/angle_reward/std": 0.7002083659172058, + "rewards/thinking_verbosity_reward/mean": -1.2859245538711548, + "rewards/thinking_verbosity_reward/std": 0.3378249704837799, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 634.515625, + "epoch": 0.31065759637188206, + "grad_norm": 0.018295975401997566, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34109641.0, + "rewards/KL_reward/mean": -0.00699696596711874, + "rewards/KL_reward/std": 0.006174146663397551, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.027715960517525673, + "rewards/angle_reward/std": 0.734915018081665, + "rewards/thinking_verbosity_reward/mean": -1.2406665086746216, + "rewards/thinking_verbosity_reward/std": 0.21177084743976593, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 701.734375, + "epoch": 0.31179138321995464, + "grad_norm": 0.01832558587193489, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34231791.0, + "rewards/KL_reward/mean": -0.0055017475970089436, + "rewards/KL_reward/std": 0.006083634216338396, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": 0.054435499012470245, + "rewards/angle_reward/std": 0.6996498107910156, + "rewards/thinking_verbosity_reward/mean": -1.2995705604553223, + "rewards/thinking_verbosity_reward/std": 0.2517715096473694, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 568.734375, + "epoch": 0.3129251700680272, + "grad_norm": 0.02160707302391529, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 34336997.0, + "rewards/KL_reward/mean": -0.006332578137516975, + "rewards/KL_reward/std": 0.008337794803082943, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.012452768161892891, + "rewards/angle_reward/std": 0.6941329836845398, + "rewards/thinking_verbosity_reward/mean": -1.1567527055740356, + "rewards/thinking_verbosity_reward/std": 0.2861800491809845, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 620.0703125, + "epoch": 0.31405895691609975, + "grad_norm": 0.020352918654680252, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34448878.0, + "rewards/KL_reward/mean": -0.007475889287889004, + "rewards/KL_reward/std": 0.007057628594338894, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": -0.027694180607795715, + "rewards/angle_reward/std": 0.7493412494659424, + "rewards/thinking_verbosity_reward/mean": -1.2272380590438843, + "rewards/thinking_verbosity_reward/std": 0.20462197065353394, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 690.4375, + "epoch": 0.31519274376417233, + "grad_norm": 0.01698232814669609, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 34569086.0, + "rewards/KL_reward/mean": -0.006635461002588272, + "rewards/KL_reward/std": 0.005810300353914499, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": 0.04517771303653717, + "rewards/angle_reward/std": 0.7285328507423401, + "rewards/thinking_verbosity_reward/mean": -1.288184642791748, + "rewards/thinking_verbosity_reward/std": 0.25428318977355957, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 713.0390625, + "epoch": 0.3163265306122449, + "grad_norm": 0.017586758360266685, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 34692219.0, + "rewards/KL_reward/mean": -0.006374924443662167, + "rewards/KL_reward/std": 0.005685480311512947, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.09032676368951797, + "rewards/angle_reward/std": 0.68100905418396, + "rewards/thinking_verbosity_reward/mean": -1.3132407665252686, + "rewards/thinking_verbosity_reward/std": 0.23650206625461578, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 682.6015625, + "epoch": 0.31746031746031744, + "grad_norm": 0.01601078175008297, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 34811512.0, + "rewards/KL_reward/mean": -0.0063264500349760056, + "rewards/KL_reward/std": 0.006804726552218199, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.014285329729318619, + "rewards/angle_reward/std": 0.7129133343696594, + "rewards/thinking_verbosity_reward/mean": -1.2733216285705566, + "rewards/thinking_verbosity_reward/std": 0.2886287271976471, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 646.5859375, + "epoch": 0.31859410430839, + "grad_norm": 0.018729139119386673, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 34925931.0, + "rewards/KL_reward/mean": -0.0060574933886528015, + "rewards/KL_reward/std": 0.006262896582484245, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "rewards/angle_reward/mean": -0.03853822499513626, + "rewards/angle_reward/std": 0.672224223613739, + "rewards/thinking_verbosity_reward/mean": -1.242337703704834, + "rewards/thinking_verbosity_reward/std": 0.26664164662361145, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 756.6640625, + "epoch": 0.3197278911564626, + "grad_norm": 0.01873278245329857, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 35054304.0, + "rewards/KL_reward/mean": -0.007486949674785137, + "rewards/KL_reward/std": 0.0073571400716900826, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.09576274454593658, + "rewards/angle_reward/std": 0.6821129322052002, + "rewards/thinking_verbosity_reward/mean": -1.3155295848846436, + "rewards/thinking_verbosity_reward/std": 0.39976224303245544, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 622.8515625, + "epoch": 0.32086167800453513, + "grad_norm": 0.018422694876790047, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35166341.0, + "rewards/KL_reward/mean": -0.008260250091552734, + "rewards/KL_reward/std": 0.00703958235681057, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.0013125715777277946, + "rewards/angle_reward/std": 0.7073757648468018, + "rewards/thinking_verbosity_reward/mean": -1.2233890295028687, + "rewards/thinking_verbosity_reward/std": 0.24154826998710632, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 672.703125, + "epoch": 0.3219954648526077, + "grad_norm": 0.018485447391867638, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 35284727.0, + "rewards/KL_reward/mean": -0.00819874182343483, + "rewards/KL_reward/std": 0.007637142203748226, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.027087576687335968, + "rewards/angle_reward/std": 0.706924319267273, + "rewards/thinking_verbosity_reward/mean": -1.2631499767303467, + "rewards/thinking_verbosity_reward/std": 0.2903617322444916, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 656.4921875, + "epoch": 0.3231292517006803, + "grad_norm": 0.01780383102595806, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35401038.0, + "rewards/KL_reward/mean": -0.006157747469842434, + "rewards/KL_reward/std": 0.006943391170352697, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.057412780821323395, + "rewards/angle_reward/std": 0.7159155011177063, + "rewards/thinking_verbosity_reward/mean": -1.257150411605835, + "rewards/thinking_verbosity_reward/std": 0.24236872792243958, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 666.65625, + "epoch": 0.3242630385487528, + "grad_norm": 0.01581946201622486, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 35518610.0, + "rewards/KL_reward/mean": -0.005965085234493017, + "rewards/KL_reward/std": 0.0054606543853878975, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": 0.0004185568541288376, + "rewards/angle_reward/std": 0.6976437568664551, + "rewards/thinking_verbosity_reward/mean": -1.2735779285430908, + "rewards/thinking_verbosity_reward/std": 0.20594365894794464, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 616.828125, + "epoch": 0.3253968253968254, + "grad_norm": 0.018370624631643295, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35629364.0, + "rewards/KL_reward/mean": -0.009418688714504242, + "rewards/KL_reward/std": 0.00715947849676013, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.003683575429022312, + "rewards/angle_reward/std": 0.6758346557617188, + "rewards/thinking_verbosity_reward/mean": -1.221147894859314, + "rewards/thinking_verbosity_reward/std": 0.2207469344139099, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 645.171875, + "epoch": 0.32653061224489793, + "grad_norm": 0.02141338214278221, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 35743922.0, + "rewards/KL_reward/mean": -0.007814271375536919, + "rewards/KL_reward/std": 0.0062730573117733, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.03710462898015976, + "rewards/angle_reward/std": 0.7213236093521118, + "rewards/thinking_verbosity_reward/mean": -1.2461848258972168, + "rewards/thinking_verbosity_reward/std": 0.24051284790039062, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 610.546875, + "epoch": 0.3276643990929705, + "grad_norm": 0.019021552056074142, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 35853616.0, + "rewards/KL_reward/mean": -0.008106883615255356, + "rewards/KL_reward/std": 0.00803397037088871, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "rewards/angle_reward/mean": -0.014590539038181305, + "rewards/angle_reward/std": 0.7261592149734497, + "rewards/thinking_verbosity_reward/mean": -1.2160003185272217, + "rewards/thinking_verbosity_reward/std": 0.213370218873024, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 633.015625, + "epoch": 0.3287981859410431, + "grad_norm": 0.021869687363505363, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 35966794.0, + "rewards/KL_reward/mean": -0.007401672657579184, + "rewards/KL_reward/std": 0.006323895882815123, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.03046243079006672, + "rewards/angle_reward/std": 0.6955752968788147, + "rewards/thinking_verbosity_reward/mean": -1.2145500183105469, + "rewards/thinking_verbosity_reward/std": 0.32519689202308655, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 667.59375, + "epoch": 0.3299319727891156, + "grad_norm": 0.01596551202237606, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36084022.0, + "rewards/KL_reward/mean": -0.007860099896788597, + "rewards/KL_reward/std": 0.00665412750095129, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.029873479157686234, + "rewards/angle_reward/std": 0.707763671875, + "rewards/thinking_verbosity_reward/mean": -1.268153429031372, + "rewards/thinking_verbosity_reward/std": 0.24226154386997223, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 641.1484375, + "epoch": 0.3310657596371882, + "grad_norm": 0.019610509276390076, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 36198417.0, + "rewards/KL_reward/mean": -0.0062166606076061726, + "rewards/KL_reward/std": 0.006279591470956802, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.06403964012861252, + "rewards/angle_reward/std": 0.7216745615005493, + "rewards/thinking_verbosity_reward/mean": -1.248220682144165, + "rewards/thinking_verbosity_reward/std": 0.20642216503620148, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 673.6484375, + "epoch": 0.3321995464852608, + "grad_norm": 0.016013026237487793, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 36316948.0, + "rewards/KL_reward/mean": -0.008568549528717995, + "rewards/KL_reward/std": 0.007496171165257692, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.035736314952373505, + "rewards/angle_reward/std": 0.7004334926605225, + "rewards/thinking_verbosity_reward/mean": -1.2675621509552002, + "rewards/thinking_verbosity_reward/std": 0.2747315466403961, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 657.9921875, + "epoch": 0.3333333333333333, + "grad_norm": 0.020345015451312065, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 36432819.0, + "rewards/KL_reward/mean": -0.008337104693055153, + "rewards/KL_reward/std": 0.007409153506159782, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.09618128091096878, + "rewards/angle_reward/std": 0.7162677049636841, + "rewards/thinking_verbosity_reward/mean": -1.2603285312652588, + "rewards/thinking_verbosity_reward/std": 0.23327921330928802, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 665.0859375, + "epoch": 0.3344671201814059, + "grad_norm": 0.019598834216594696, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 36549526.0, + "rewards/KL_reward/mean": -0.005868074018508196, + "rewards/KL_reward/std": 0.005221591331064701, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "rewards/angle_reward/mean": -0.03156152367591858, + "rewards/angle_reward/std": 0.7383613586425781, + "rewards/thinking_verbosity_reward/mean": -1.2608528137207031, + "rewards/thinking_verbosity_reward/std": 0.266458660364151, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 680.3125, + "epoch": 0.3356009070294785, + "grad_norm": 0.01849890686571598, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36667966.0, + "rewards/KL_reward/mean": -0.00805110577493906, + "rewards/KL_reward/std": 0.007103241514414549, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": 0.042491309344768524, + "rewards/angle_reward/std": 0.699578046798706, + "rewards/thinking_verbosity_reward/mean": -1.2727693319320679, + "rewards/thinking_verbosity_reward/std": 0.280956506729126, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 635.7734375, + "epoch": 0.336734693877551, + "grad_norm": 0.020537061616778374, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 36781049.0, + "rewards/KL_reward/mean": -0.007871702313423157, + "rewards/KL_reward/std": 0.007687455043196678, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.09363602101802826, + "rewards/angle_reward/std": 0.7016919255256653, + "rewards/thinking_verbosity_reward/mean": -1.2297581434249878, + "rewards/thinking_verbosity_reward/std": 0.274255633354187, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 693.84375, + "epoch": 0.3378684807256236, + "grad_norm": 0.015424701385200024, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 36902181.0, + "rewards/KL_reward/mean": -0.0069398339837789536, + "rewards/KL_reward/std": 0.005675792694091797, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.010185010731220245, + "rewards/angle_reward/std": 0.7017130851745605, + "rewards/thinking_verbosity_reward/mean": -1.2912731170654297, + "rewards/thinking_verbosity_reward/std": 0.25536859035491943, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 662.3046875, + "epoch": 0.33900226757369617, + "grad_norm": 0.01764794811606407, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37018204.0, + "rewards/KL_reward/mean": -0.007857094518840313, + "rewards/KL_reward/std": 0.007627793122082949, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.017138749361038208, + "rewards/angle_reward/std": 0.7161518335342407, + "rewards/thinking_verbosity_reward/mean": -1.2537500858306885, + "rewards/thinking_verbosity_reward/std": 0.28648269176483154, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 784.5390625, + "epoch": 0.3401360544217687, + "grad_norm": 0.01640402339398861, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37150393.0, + "rewards/KL_reward/mean": -0.006395349279046059, + "rewards/KL_reward/std": 0.006379930768162012, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.049736715853214264, + "rewards/angle_reward/std": 0.7273575067520142, + "rewards/thinking_verbosity_reward/mean": -1.3761987686157227, + "rewards/thinking_verbosity_reward/std": 0.25580403208732605, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 598.3515625, + "epoch": 0.3412698412698413, + "grad_norm": 0.018060341477394104, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 37257910.0, + "rewards/KL_reward/mean": -0.007060700561851263, + "rewards/KL_reward/std": 0.006098188925534487, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.007485490292310715, + "rewards/angle_reward/std": 0.6995981931686401, + "rewards/thinking_verbosity_reward/mean": -1.2047525644302368, + "rewards/thinking_verbosity_reward/std": 0.2056220918893814, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 517.0390625, + "epoch": 0.3424036281179138, + "grad_norm": 0.022523924708366394, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37355931.0, + "rewards/KL_reward/mean": -0.009995004162192345, + "rewards/KL_reward/std": 0.00978040136396885, + "rewards/accuracy_reward/mean": 0.28125, + "rewards/accuracy_reward/std": 0.4513758420944214, + "rewards/angle_reward/mean": 0.062474675476551056, + "rewards/angle_reward/std": 0.716263473033905, + "rewards/thinking_verbosity_reward/mean": -1.1115931272506714, + "rewards/thinking_verbosity_reward/std": 0.23419605195522308, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 586.640625, + "epoch": 0.3435374149659864, + "grad_norm": 0.02103191800415516, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 37463093.0, + "rewards/KL_reward/mean": -0.00888234656304121, + "rewards/KL_reward/std": 0.008172755129635334, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.050526078790426254, + "rewards/angle_reward/std": 0.7193748950958252, + "rewards/thinking_verbosity_reward/mean": -1.1831541061401367, + "rewards/thinking_verbosity_reward/std": 0.254467636346817, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 565.203125, + "epoch": 0.34467120181405897, + "grad_norm": 0.019218802452087402, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37566719.0, + "rewards/KL_reward/mean": -0.009941613301634789, + "rewards/KL_reward/std": 0.007637757807970047, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.022061128169298172, + "rewards/angle_reward/std": 0.708019495010376, + "rewards/thinking_verbosity_reward/mean": -1.1639440059661865, + "rewards/thinking_verbosity_reward/std": 0.23694221675395966, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 669.921875, + "epoch": 0.3458049886621315, + "grad_norm": 0.019459662958979607, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37684597.0, + "rewards/KL_reward/mean": -0.009102117270231247, + "rewards/KL_reward/std": 0.008494559675455093, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.024678537622094154, + "rewards/angle_reward/std": 0.7370985746383667, + "rewards/thinking_verbosity_reward/mean": -1.2622956037521362, + "rewards/thinking_verbosity_reward/std": 0.28195133805274963, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 692.65625, + "epoch": 0.3469387755102041, + "grad_norm": 0.019493240863084793, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37805657.0, + "rewards/KL_reward/mean": -0.008165314793586731, + "rewards/KL_reward/std": 0.007228944916278124, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": 0.062268540263175964, + "rewards/angle_reward/std": 0.7373337149620056, + "rewards/thinking_verbosity_reward/mean": -1.2855594158172607, + "rewards/thinking_verbosity_reward/std": 0.2775971591472626, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 663.1640625, + "epoch": 0.34807256235827666, + "grad_norm": 0.016767730936408043, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 37922774.0, + "rewards/KL_reward/mean": -0.008340677246451378, + "rewards/KL_reward/std": 0.007052331697195768, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": 0.062498558312654495, + "rewards/angle_reward/std": 0.6978163123130798, + "rewards/thinking_verbosity_reward/mean": -1.2607831954956055, + "rewards/thinking_verbosity_reward/std": 0.2573254406452179, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 635.0546875, + "epoch": 0.3492063492063492, + "grad_norm": 0.016742372885346413, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38035365.0, + "rewards/KL_reward/mean": -0.010181295685470104, + "rewards/KL_reward/std": 0.007941014133393764, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": 0.042019449174404144, + "rewards/angle_reward/std": 0.6886358261108398, + "rewards/thinking_verbosity_reward/mean": -1.2268184423446655, + "rewards/thinking_verbosity_reward/std": 0.2841179370880127, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 658.5078125, + "epoch": 0.35034013605442177, + "grad_norm": 0.016526585444808006, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 38151198.0, + "rewards/KL_reward/mean": -0.0074892486445605755, + "rewards/KL_reward/std": 0.0055892588570714, + "rewards/accuracy_reward/mean": 0.296875, + "rewards/accuracy_reward/std": 0.45867621898651123, + "rewards/angle_reward/mean": -0.026805002242326736, + "rewards/angle_reward/std": 0.700246274471283, + "rewards/thinking_verbosity_reward/mean": -1.2586907148361206, + "rewards/thinking_verbosity_reward/std": 0.24481716752052307, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 683.9453125, + "epoch": 0.35147392290249435, + "grad_norm": 0.0192258283495903, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 38270863.0, + "rewards/KL_reward/mean": -0.009363025426864624, + "rewards/KL_reward/std": 0.007326322607696056, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.06473156809806824, + "rewards/angle_reward/std": 0.7032992243766785, + "rewards/thinking_verbosity_reward/mean": -1.2719552516937256, + "rewards/thinking_verbosity_reward/std": 0.3002309501171112, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 619.0703125, + "epoch": 0.3526077097505669, + "grad_norm": 0.016836460679769516, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 38381744.0, + "rewards/KL_reward/mean": -0.010077145881950855, + "rewards/KL_reward/std": 0.007469221018254757, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.040938571095466614, + "rewards/angle_reward/std": 0.7000210285186768, + "rewards/thinking_verbosity_reward/mean": -1.2291001081466675, + "rewards/thinking_verbosity_reward/std": 0.1862824708223343, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 642.453125, + "epoch": 0.35374149659863946, + "grad_norm": 0.017571302130818367, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 38496154.0, + "rewards/KL_reward/mean": -0.008081937208771706, + "rewards/KL_reward/std": 0.006963278166949749, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.05039401724934578, + "rewards/angle_reward/std": 0.7186501622200012, + "rewards/thinking_verbosity_reward/mean": -1.2407217025756836, + "rewards/thinking_verbosity_reward/std": 0.254418283700943, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 714.4375, + "epoch": 0.35487528344671204, + "grad_norm": 0.017241306602954865, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38619218.0, + "rewards/KL_reward/mean": -0.008873986080288887, + "rewards/KL_reward/std": 0.008177191950380802, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.07794732600450516, + "rewards/angle_reward/std": 0.720664918422699, + "rewards/thinking_verbosity_reward/mean": -1.3019829988479614, + "rewards/thinking_verbosity_reward/std": 0.29851409792900085, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 658.84375, + "epoch": 0.35600907029478457, + "grad_norm": 0.018195878714323044, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38735758.0, + "rewards/KL_reward/mean": -0.008417133241891861, + "rewards/KL_reward/std": 0.006822847295552492, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.03660423681139946, + "rewards/angle_reward/std": 0.7281025648117065, + "rewards/thinking_verbosity_reward/mean": -1.2603774070739746, + "rewards/thinking_verbosity_reward/std": 0.2376958131790161, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 695.375, + "epoch": 0.35714285714285715, + "grad_norm": 0.01648283377289772, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38856750.0, + "rewards/KL_reward/mean": -0.008553637191653252, + "rewards/KL_reward/std": 0.007821978069841862, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.021729689091444016, + "rewards/angle_reward/std": 0.6876324415206909, + "rewards/thinking_verbosity_reward/mean": -1.2965296506881714, + "rewards/thinking_verbosity_reward/std": 0.2352479249238968, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 566.4609375, + "epoch": 0.35827664399092973, + "grad_norm": 0.02313867025077343, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 38961017.0, + "rewards/KL_reward/mean": -0.012008454650640488, + "rewards/KL_reward/std": 0.008895636536180973, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.07547923922538757, + "rewards/angle_reward/std": 0.7204188704490662, + "rewards/thinking_verbosity_reward/mean": -1.1683481931686401, + "rewards/thinking_verbosity_reward/std": 0.22139836847782135, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 652.84375, + "epoch": 0.35941043083900226, + "grad_norm": 0.014823821373283863, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39075997.0, + "rewards/KL_reward/mean": -0.008365944027900696, + "rewards/KL_reward/std": 0.00788775086402893, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.023123951628804207, + "rewards/angle_reward/std": 0.7013049125671387, + "rewards/thinking_verbosity_reward/mean": -1.2614336013793945, + "rewards/thinking_verbosity_reward/std": 0.19676585495471954, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 653.203125, + "epoch": 0.36054421768707484, + "grad_norm": 0.017812160775065422, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 39191943.0, + "rewards/KL_reward/mean": -0.00916835106909275, + "rewards/KL_reward/std": 0.008945772424340248, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.0597342923283577, + "rewards/angle_reward/std": 0.7158983945846558, + "rewards/thinking_verbosity_reward/mean": -1.2495640516281128, + "rewards/thinking_verbosity_reward/std": 0.2638596296310425, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 629.8359375, + "epoch": 0.36167800453514737, + "grad_norm": 0.016286782920360565, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 39304482.0, + "rewards/KL_reward/mean": -0.010274011641740799, + "rewards/KL_reward/std": 0.008000952191650867, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22826264798641205, + "rewards/angle_reward/mean": -0.008843816816806793, + "rewards/angle_reward/std": 0.7008348107337952, + "rewards/thinking_verbosity_reward/mean": -1.2296520471572876, + "rewards/thinking_verbosity_reward/std": 0.2459651231765747, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 666.015625, + "epoch": 0.36281179138321995, + "grad_norm": 0.017830902710556984, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39422132.0, + "rewards/KL_reward/mean": -0.009082363918423653, + "rewards/KL_reward/std": 0.008023828268051147, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.07964855432510376, + "rewards/angle_reward/std": 0.7010746598243713, + "rewards/thinking_verbosity_reward/mean": -1.2581864595413208, + "rewards/thinking_verbosity_reward/std": 0.28311049938201904, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 690.1640625, + "epoch": 0.36394557823129253, + "grad_norm": 0.016204338520765305, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39541873.0, + "rewards/KL_reward/mean": -0.00774435605853796, + "rewards/KL_reward/std": 0.007772011682391167, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.020165707916021347, + "rewards/angle_reward/std": 0.7051674127578735, + "rewards/thinking_verbosity_reward/mean": -1.2649781703948975, + "rewards/thinking_verbosity_reward/std": 0.35163623094558716, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 714.8828125, + "epoch": 0.36507936507936506, + "grad_norm": 0.01743881031870842, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 39664826.0, + "rewards/KL_reward/mean": -0.008616095408797264, + "rewards/KL_reward/std": 0.007796288467943668, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.0778302252292633, + "rewards/angle_reward/std": 0.711172878742218, + "rewards/thinking_verbosity_reward/mean": -1.306434988975525, + "rewards/thinking_verbosity_reward/std": 0.2801944315433502, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 636.0078125, + "epoch": 0.36621315192743764, + "grad_norm": 0.021684397011995316, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 39778667.0, + "rewards/KL_reward/mean": -0.011362764984369278, + "rewards/KL_reward/std": 0.00869831908494234, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.029594052582979202, + "rewards/angle_reward/std": 0.7153961658477783, + "rewards/thinking_verbosity_reward/mean": -1.225907325744629, + "rewards/thinking_verbosity_reward/std": 0.2921258807182312, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 598.90625, + "epoch": 0.3673469387755102, + "grad_norm": 0.01639612577855587, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 39887495.0, + "rewards/KL_reward/mean": -0.00996290985494852, + "rewards/KL_reward/std": 0.008893659338355064, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.04011102393269539, + "rewards/angle_reward/std": 0.6990154385566711, + "rewards/thinking_verbosity_reward/mean": -1.1960594654083252, + "rewards/thinking_verbosity_reward/std": 0.2544650733470917, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 594.0078125, + "epoch": 0.36848072562358275, + "grad_norm": 0.020544040948152542, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 39995504.0, + "rewards/KL_reward/mean": -0.009360745549201965, + "rewards/KL_reward/std": 0.0074703386053442955, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.09173193573951721, + "rewards/angle_reward/std": 0.7248408794403076, + "rewards/thinking_verbosity_reward/mean": -1.195133924484253, + "rewards/thinking_verbosity_reward/std": 0.23354220390319824, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 615.875, + "epoch": 0.36961451247165533, + "grad_norm": 0.019898978993296623, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40106392.0, + "rewards/KL_reward/mean": -0.009691888466477394, + "rewards/KL_reward/std": 0.007954951375722885, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.08430376648902893, + "rewards/angle_reward/std": 0.7162787914276123, + "rewards/thinking_verbosity_reward/mean": -1.2271376848220825, + "rewards/thinking_verbosity_reward/std": 0.17739422619342804, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 569.078125, + "epoch": 0.3707482993197279, + "grad_norm": 0.019553160294890404, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 40210906.0, + "rewards/KL_reward/mean": -0.011003728024661541, + "rewards/KL_reward/std": 0.009004923515021801, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": -0.03519893437623978, + "rewards/angle_reward/std": 0.71088045835495, + "rewards/thinking_verbosity_reward/mean": -1.163379430770874, + "rewards/thinking_verbosity_reward/std": 0.25940099358558655, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 581.140625, + "epoch": 0.37188208616780044, + "grad_norm": 0.019406193867325783, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 40317180.0, + "rewards/KL_reward/mean": -0.009035211056470871, + "rewards/KL_reward/std": 0.0066587477922439575, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.06995624303817749, + "rewards/angle_reward/std": 0.7030081748962402, + "rewards/thinking_verbosity_reward/mean": -1.184302568435669, + "rewards/thinking_verbosity_reward/std": 0.21930840611457825, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 608.3671875, + "epoch": 0.373015873015873, + "grad_norm": 0.017711546272039413, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 40426515.0, + "rewards/KL_reward/mean": -0.009203861467540264, + "rewards/KL_reward/std": 0.007614146452397108, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.0009618657641112804, + "rewards/angle_reward/std": 0.6823228001594543, + "rewards/thinking_verbosity_reward/mean": -1.21611487865448, + "rewards/thinking_verbosity_reward/std": 0.19933529198169708, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 628.4765625, + "epoch": 0.3741496598639456, + "grad_norm": 0.017541462555527687, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40538904.0, + "rewards/KL_reward/mean": -0.009096905589103699, + "rewards/KL_reward/std": 0.009053525514900684, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": 0.00803598016500473, + "rewards/angle_reward/std": 0.7005848288536072, + "rewards/thinking_verbosity_reward/mean": -1.2112784385681152, + "rewards/thinking_verbosity_reward/std": 0.319836288690567, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 670.0078125, + "epoch": 0.37528344671201813, + "grad_norm": 0.02290179766714573, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 40656385.0, + "rewards/KL_reward/mean": -0.009822444058954716, + "rewards/KL_reward/std": 0.0084287254139781, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.07108981907367706, + "rewards/angle_reward/std": 0.72761070728302, + "rewards/thinking_verbosity_reward/mean": -1.272742509841919, + "rewards/thinking_verbosity_reward/std": 0.2303491234779358, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 605.453125, + "epoch": 0.3764172335600907, + "grad_norm": 0.018231753259897232, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 40765707.0, + "rewards/KL_reward/mean": -0.010320719331502914, + "rewards/KL_reward/std": 0.007579497992992401, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": 0.0481044203042984, + "rewards/angle_reward/std": 0.7102060317993164, + "rewards/thinking_verbosity_reward/mean": -1.2064979076385498, + "rewards/thinking_verbosity_reward/std": 0.2365427017211914, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 585.71875, + "epoch": 0.37755102040816324, + "grad_norm": 0.01733916811645031, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 40871983.0, + "rewards/KL_reward/mean": -0.009395333006978035, + "rewards/KL_reward/std": 0.00850164145231247, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": -0.09402258694171906, + "rewards/angle_reward/std": 0.6644055247306824, + "rewards/thinking_verbosity_reward/mean": -1.1958613395690918, + "rewards/thinking_verbosity_reward/std": 0.17861491441726685, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 644.5390625, + "epoch": 0.3786848072562358, + "grad_norm": 0.015585731714963913, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 40986700.0, + "rewards/KL_reward/mean": -0.011141793802380562, + "rewards/KL_reward/std": 0.007814460434019566, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.050050366669893265, + "rewards/angle_reward/std": 0.6957428455352783, + "rewards/thinking_verbosity_reward/mean": -1.2409121990203857, + "rewards/thinking_verbosity_reward/std": 0.263643354177475, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 552.734375, + "epoch": 0.3798185941043084, + "grad_norm": 0.01960836723446846, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41089434.0, + "rewards/KL_reward/mean": -0.014167695306241512, + "rewards/KL_reward/std": 0.010362173430621624, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": 0.005596708040684462, + "rewards/angle_reward/std": 0.7121492624282837, + "rewards/thinking_verbosity_reward/mean": -1.151566743850708, + "rewards/thinking_verbosity_reward/std": 0.23162296414375305, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 580.59375, + "epoch": 0.38095238095238093, + "grad_norm": 0.018973182886838913, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 41194886.0, + "rewards/KL_reward/mean": -0.011608713306486607, + "rewards/KL_reward/std": 0.01016836054623127, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.09077536314725876, + "rewards/angle_reward/std": 0.6637977957725525, + "rewards/thinking_verbosity_reward/mean": -1.1720116138458252, + "rewards/thinking_verbosity_reward/std": 0.27554944157600403, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 548.703125, + "epoch": 0.3820861678004535, + "grad_norm": 0.020832985639572144, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41297000.0, + "rewards/KL_reward/mean": -0.01433346327394247, + "rewards/KL_reward/std": 0.010334284976124763, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.026250137016177177, + "rewards/angle_reward/std": 0.7008341550827026, + "rewards/thinking_verbosity_reward/mean": -1.149053692817688, + "rewards/thinking_verbosity_reward/std": 0.22190140187740326, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 651.7421875, + "epoch": 0.3832199546485261, + "grad_norm": 0.016582710668444633, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 41412367.0, + "rewards/KL_reward/mean": -0.011677080765366554, + "rewards/KL_reward/std": 0.007517748977988958, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.07228608429431915, + "rewards/angle_reward/std": 0.6654783487319946, + "rewards/thinking_verbosity_reward/mean": -1.2558412551879883, + "rewards/thinking_verbosity_reward/std": 0.2238079309463501, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 473.3828125, + "epoch": 0.3843537414965986, + "grad_norm": 0.027373218908905983, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41505200.0, + "rewards/KL_reward/mean": -0.014650771394371986, + "rewards/KL_reward/std": 0.010913332924246788, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.05021342635154724, + "rewards/angle_reward/std": 0.6841585040092468, + "rewards/thinking_verbosity_reward/mean": -1.0703449249267578, + "rewards/thinking_verbosity_reward/std": 0.18872351944446564, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 538.09375, + "epoch": 0.3854875283446712, + "grad_norm": 0.022517004981637, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 41606260.0, + "rewards/KL_reward/mean": -0.013644316233694553, + "rewards/KL_reward/std": 0.009786357171833515, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": 0.01600516028702259, + "rewards/angle_reward/std": 0.6811925172805786, + "rewards/thinking_verbosity_reward/mean": -1.1354541778564453, + "rewards/thinking_verbosity_reward/std": 0.2319503128528595, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 647.5234375, + "epoch": 0.3866213151927438, + "grad_norm": 0.016684694215655327, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 41720591.0, + "rewards/KL_reward/mean": -0.010870617814362049, + "rewards/KL_reward/std": 0.009292100556194782, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.0134214386343956, + "rewards/angle_reward/std": 0.6902464032173157, + "rewards/thinking_verbosity_reward/mean": -1.2468688488006592, + "rewards/thinking_verbosity_reward/std": 0.24922247231006622, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 565.109375, + "epoch": 0.3877551020408163, + "grad_norm": 0.01767166331410408, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41825269.0, + "rewards/KL_reward/mean": -0.011611821129918098, + "rewards/KL_reward/std": 0.009323826991021633, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.0860278457403183, + "rewards/angle_reward/std": 0.6994227170944214, + "rewards/thinking_verbosity_reward/mean": -1.1734181642532349, + "rewards/thinking_verbosity_reward/std": 0.18337365984916687, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 695.015625, + "epoch": 0.3888888888888889, + "grad_norm": 0.016172772273421288, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 41946431.0, + "rewards/KL_reward/mean": -0.0106268972158432, + "rewards/KL_reward/std": 0.00812254287302494, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.055921170860528946, + "rewards/angle_reward/std": 0.7074174880981445, + "rewards/thinking_verbosity_reward/mean": -1.2934153079986572, + "rewards/thinking_verbosity_reward/std": 0.2502100169658661, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 492.84375, + "epoch": 0.3900226757369615, + "grad_norm": 0.022642863914370537, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 42041259.0, + "rewards/KL_reward/mean": -0.013332553207874298, + "rewards/KL_reward/std": 0.009947570040822029, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.026306793093681335, + "rewards/angle_reward/std": 0.7171652317047119, + "rewards/thinking_verbosity_reward/mean": -1.0878281593322754, + "rewards/thinking_verbosity_reward/std": 0.21583305299282074, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 582.0078125, + "epoch": 0.391156462585034, + "grad_norm": 0.021043403074145317, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42147996.0, + "rewards/KL_reward/mean": -0.011846832931041718, + "rewards/KL_reward/std": 0.009587462060153484, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "rewards/angle_reward/mean": -0.05014696717262268, + "rewards/angle_reward/std": 0.7022067904472351, + "rewards/thinking_verbosity_reward/mean": -1.1845569610595703, + "rewards/thinking_verbosity_reward/std": 0.22296461462974548, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 593.0703125, + "epoch": 0.3922902494331066, + "grad_norm": 0.018539853394031525, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 42256085.0, + "rewards/KL_reward/mean": -0.01160618755966425, + "rewards/KL_reward/std": 0.009447838179767132, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.026164177805185318, + "rewards/angle_reward/std": 0.7124016284942627, + "rewards/thinking_verbosity_reward/mean": -1.19658625125885, + "rewards/thinking_verbosity_reward/std": 0.2206772416830063, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 544.828125, + "epoch": 0.3934240362811791, + "grad_norm": 0.02210315316915512, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 42358007.0, + "rewards/KL_reward/mean": -0.013557846657931805, + "rewards/KL_reward/std": 0.009934193454682827, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.03684143349528313, + "rewards/angle_reward/std": 0.698218047618866, + "rewards/thinking_verbosity_reward/mean": -1.1469924449920654, + "rewards/thinking_verbosity_reward/std": 0.21053484082221985, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 622.9765625, + "epoch": 0.3945578231292517, + "grad_norm": 0.017409779131412506, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42469940.0, + "rewards/KL_reward/mean": -0.010706034488976002, + "rewards/KL_reward/std": 0.009406552650034428, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.03276388719677925, + "rewards/angle_reward/std": 0.6755772829055786, + "rewards/thinking_verbosity_reward/mean": -1.2311208248138428, + "rewards/thinking_verbosity_reward/std": 0.1989777535200119, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 641.8515625, + "epoch": 0.3956916099773243, + "grad_norm": 0.016742389649152756, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42583889.0, + "rewards/KL_reward/mean": -0.011015353724360466, + "rewards/KL_reward/std": 0.008712667971849442, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.008286483585834503, + "rewards/angle_reward/std": 0.7040897011756897, + "rewards/thinking_verbosity_reward/mean": -1.2278902530670166, + "rewards/thinking_verbosity_reward/std": 0.30847659707069397, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 568.109375, + "epoch": 0.3968253968253968, + "grad_norm": 0.021108614280819893, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42688471.0, + "rewards/KL_reward/mean": -0.013645791448652744, + "rewards/KL_reward/std": 0.011249223724007607, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": 0.05859003961086273, + "rewards/angle_reward/std": 0.7381419539451599, + "rewards/thinking_verbosity_reward/mean": -1.1669996976852417, + "rewards/thinking_verbosity_reward/std": 0.23741234838962555, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 553.9765625, + "epoch": 0.3979591836734694, + "grad_norm": 0.020242752507328987, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42791812.0, + "rewards/KL_reward/mean": -0.013703307136893272, + "rewards/KL_reward/std": 0.010288462042808533, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.01580832153558731, + "rewards/angle_reward/std": 0.7156597971916199, + "rewards/thinking_verbosity_reward/mean": -1.15157949924469, + "rewards/thinking_verbosity_reward/std": 0.23826247453689575, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 574.71875, + "epoch": 0.39909297052154197, + "grad_norm": 0.021016456186771393, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 42897768.0, + "rewards/KL_reward/mean": -0.011165348812937737, + "rewards/KL_reward/std": 0.0091776167973876, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "rewards/angle_reward/mean": -0.014624292030930519, + "rewards/angle_reward/std": 0.6658822894096375, + "rewards/thinking_verbosity_reward/mean": -1.1669721603393555, + "rewards/thinking_verbosity_reward/std": 0.2702290713787079, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 604.578125, + "epoch": 0.4002267573696145, + "grad_norm": 0.01961846835911274, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43007306.0, + "rewards/KL_reward/mean": -0.01148420199751854, + "rewards/KL_reward/std": 0.010005916468799114, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.03877856582403183, + "rewards/angle_reward/std": 0.6896043419837952, + "rewards/thinking_verbosity_reward/mean": -1.1999263763427734, + "rewards/thinking_verbosity_reward/std": 0.26386937499046326, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 608.1640625, + "epoch": 0.4013605442176871, + "grad_norm": 0.017984895035624504, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43117167.0, + "rewards/KL_reward/mean": -0.012511475943028927, + "rewards/KL_reward/std": 0.008827902376651764, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.01137047354131937, + "rewards/angle_reward/std": 0.7264191508293152, + "rewards/thinking_verbosity_reward/mean": -1.2140936851501465, + "rewards/thinking_verbosity_reward/std": 0.21022173762321472, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 596.3125, + "epoch": 0.40249433106575966, + "grad_norm": 0.01846635527908802, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 43225463.0, + "rewards/KL_reward/mean": -0.011512024328112602, + "rewards/KL_reward/std": 0.009280053898692131, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.049537211656570435, + "rewards/angle_reward/std": 0.6839423179626465, + "rewards/thinking_verbosity_reward/mean": -1.201066017150879, + "rewards/thinking_verbosity_reward/std": 0.2146666944026947, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 669.7578125, + "epoch": 0.4036281179138322, + "grad_norm": 0.022069066762924194, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43343104.0, + "rewards/KL_reward/mean": -0.013993291184306145, + "rewards/KL_reward/std": 0.012002113275229931, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.0638950914144516, + "rewards/angle_reward/std": 0.7101727724075317, + "rewards/thinking_verbosity_reward/mean": -1.2499220371246338, + "rewards/thinking_verbosity_reward/std": 0.33237338066101074, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 585.03125, + "epoch": 0.40476190476190477, + "grad_norm": 0.020424002781510353, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43449764.0, + "rewards/KL_reward/mean": -0.012592829763889313, + "rewards/KL_reward/std": 0.009561818093061447, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": 0.042757753282785416, + "rewards/angle_reward/std": 0.7115549445152283, + "rewards/thinking_verbosity_reward/mean": -1.182867407798767, + "rewards/thinking_verbosity_reward/std": 0.2477547973394394, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 574.359375, + "epoch": 0.40589569160997735, + "grad_norm": 0.0195750929415226, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43554914.0, + "rewards/KL_reward/mean": -0.01165720634162426, + "rewards/KL_reward/std": 0.008440484292805195, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.03374239802360535, + "rewards/angle_reward/std": 0.6871976256370544, + "rewards/thinking_verbosity_reward/mean": -1.1825599670410156, + "rewards/thinking_verbosity_reward/std": 0.18763259053230286, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 540.2109375, + "epoch": 0.4070294784580499, + "grad_norm": 0.024201730266213417, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43655309.0, + "rewards/KL_reward/mean": -0.013428442180156708, + "rewards/KL_reward/std": 0.009504460729658604, + "rewards/accuracy_reward/mean": 0.21875, + "rewards/accuracy_reward/std": 0.41502299904823303, + "rewards/angle_reward/mean": -0.03815825283527374, + "rewards/angle_reward/std": 0.7391625642776489, + "rewards/thinking_verbosity_reward/mean": -1.1418852806091309, + "rewards/thinking_verbosity_reward/std": 0.21088513731956482, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 513.296875, + "epoch": 0.40816326530612246, + "grad_norm": 0.018849622458219528, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43753107.0, + "rewards/KL_reward/mean": -0.010304594412446022, + "rewards/KL_reward/std": 0.007997327484190464, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.05903025344014168, + "rewards/angle_reward/std": 0.7214637398719788, + "rewards/thinking_verbosity_reward/mean": -1.1183676719665527, + "rewards/thinking_verbosity_reward/std": 0.17387330532073975, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 622.234375, + "epoch": 0.409297052154195, + "grad_norm": 0.014960212633013725, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 43864649.0, + "rewards/KL_reward/mean": -0.011811970733106136, + "rewards/KL_reward/std": 0.009086758829653263, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": 0.01255982369184494, + "rewards/angle_reward/std": 0.6803118586540222, + "rewards/thinking_verbosity_reward/mean": -1.2231049537658691, + "rewards/thinking_verbosity_reward/std": 0.239854633808136, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 600.4765625, + "epoch": 0.41043083900226757, + "grad_norm": 0.019353918731212616, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 43973806.0, + "rewards/KL_reward/mean": -0.012659039348363876, + "rewards/KL_reward/std": 0.010857551358640194, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.044028282165527344, + "rewards/angle_reward/std": 0.7046893239021301, + "rewards/thinking_verbosity_reward/mean": -1.204061508178711, + "rewards/thinking_verbosity_reward/std": 0.22202034294605255, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 556.4140625, + "epoch": 0.41156462585034015, + "grad_norm": 0.01892830803990364, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 44077067.0, + "rewards/KL_reward/mean": -0.013601994141936302, + "rewards/KL_reward/std": 0.01063512172549963, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "rewards/angle_reward/mean": 0.010024391114711761, + "rewards/angle_reward/std": 0.7016712427139282, + "rewards/thinking_verbosity_reward/mean": -1.1596899032592773, + "rewards/thinking_verbosity_reward/std": 0.2097102701663971, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 555.5546875, + "epoch": 0.4126984126984127, + "grad_norm": 0.02198626659810543, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44180226.0, + "rewards/KL_reward/mean": -0.015893777832388878, + "rewards/KL_reward/std": 0.01015418116003275, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": -0.002530481666326523, + "rewards/angle_reward/std": 0.6922826766967773, + "rewards/thinking_verbosity_reward/mean": -1.144598126411438, + "rewards/thinking_verbosity_reward/std": 0.27734825015068054, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 567.671875, + "epoch": 0.41383219954648526, + "grad_norm": 0.021529097110033035, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44284824.0, + "rewards/KL_reward/mean": -0.0151272714138031, + "rewards/KL_reward/std": 0.012720032595098019, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.036351703107357025, + "rewards/angle_reward/std": 0.7288860082626343, + "rewards/thinking_verbosity_reward/mean": -1.1615468263626099, + "rewards/thinking_verbosity_reward/std": 0.260768860578537, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 528.59375, + "epoch": 0.41496598639455784, + "grad_norm": 0.020824376493692398, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44383988.0, + "rewards/KL_reward/mean": -0.013930471614003181, + "rewards/KL_reward/std": 0.01095433160662651, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "rewards/angle_reward/mean": -0.005772540345788002, + "rewards/angle_reward/std": 0.6870887875556946, + "rewards/thinking_verbosity_reward/mean": -1.1287271976470947, + "rewards/thinking_verbosity_reward/std": 0.2128688246011734, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 526.5546875, + "epoch": 0.41609977324263037, + "grad_norm": 0.02206384763121605, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 44483259.0, + "rewards/KL_reward/mean": -0.016933314502239227, + "rewards/KL_reward/std": 0.012821149080991745, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.03346161171793938, + "rewards/angle_reward/std": 0.711763322353363, + "rewards/thinking_verbosity_reward/mean": -1.1266493797302246, + "rewards/thinking_verbosity_reward/std": 0.21184676885604858, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 602.3046875, + "epoch": 0.41723356009070295, + "grad_norm": 0.018928341567516327, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44592506.0, + "rewards/KL_reward/mean": -0.01630426198244095, + "rewards/KL_reward/std": 0.010932328179478645, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.194504976272583, + "rewards/angle_reward/mean": -0.0600726418197155, + "rewards/angle_reward/std": 0.7038162350654602, + "rewards/thinking_verbosity_reward/mean": -1.2096517086029053, + "rewards/thinking_verbosity_reward/std": 0.2007477581501007, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 612.390625, + "epoch": 0.41836734693877553, + "grad_norm": 0.020504910498857498, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44702980.0, + "rewards/KL_reward/mean": -0.011769358068704605, + "rewards/KL_reward/std": 0.00859046634286642, + "rewards/accuracy_reward/mean": 0.2421875, + "rewards/accuracy_reward/std": 0.4300905168056488, + "rewards/angle_reward/mean": 0.09859583526849747, + "rewards/angle_reward/std": 0.7153234481811523, + "rewards/thinking_verbosity_reward/mean": -1.204113245010376, + "rewards/thinking_verbosity_reward/std": 0.28140151500701904, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 618.8828125, + "epoch": 0.41950113378684806, + "grad_norm": 0.019282154738903046, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44814261.0, + "rewards/KL_reward/mean": -0.01377407368272543, + "rewards/KL_reward/std": 0.011624906212091446, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.00703779049217701, + "rewards/angle_reward/std": 0.6897515058517456, + "rewards/thinking_verbosity_reward/mean": -1.2140960693359375, + "rewards/thinking_verbosity_reward/std": 0.26689741015434265, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 604.5859375, + "epoch": 0.42063492063492064, + "grad_norm": 0.018426265567541122, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 44923256.0, + "rewards/KL_reward/mean": -0.011635626666247845, + "rewards/KL_reward/std": 0.007985219359397888, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.03205763176083565, + "rewards/angle_reward/std": 0.7076945900917053, + "rewards/thinking_verbosity_reward/mean": -1.206263542175293, + "rewards/thinking_verbosity_reward/std": 0.23306098580360413, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 541.3125, + "epoch": 0.4217687074829932, + "grad_norm": 0.021884731948375702, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45024336.0, + "rewards/KL_reward/mean": -0.016247743740677834, + "rewards/KL_reward/std": 0.011305822059512138, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.05057627707719803, + "rewards/angle_reward/std": 0.6928049325942993, + "rewards/thinking_verbosity_reward/mean": -1.1414589881896973, + "rewards/thinking_verbosity_reward/std": 0.21956755220890045, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 650.328125, + "epoch": 0.42290249433106575, + "grad_norm": 0.016121037304401398, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45139514.0, + "rewards/KL_reward/mean": -0.013573609292507172, + "rewards/KL_reward/std": 0.01031778659671545, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.029925987124443054, + "rewards/angle_reward/std": 0.7024592161178589, + "rewards/thinking_verbosity_reward/mean": -1.2470903396606445, + "rewards/thinking_verbosity_reward/std": 0.26195836067199707, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 574.046875, + "epoch": 0.42403628117913833, + "grad_norm": 0.026229392737150192, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45245304.0, + "rewards/KL_reward/mean": -0.01583711802959442, + "rewards/KL_reward/std": 0.011048772372305393, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.08974478393793106, + "rewards/angle_reward/std": 0.7353434562683105, + "rewards/thinking_verbosity_reward/mean": -1.1740953922271729, + "rewards/thinking_verbosity_reward/std": 0.2335452437400818, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 621.7421875, + "epoch": 0.42517006802721086, + "grad_norm": 0.02271593175828457, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45356975.0, + "rewards/KL_reward/mean": -0.017855174839496613, + "rewards/KL_reward/std": 0.013386182487010956, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.06267501413822174, + "rewards/angle_reward/std": 0.7156962752342224, + "rewards/thinking_verbosity_reward/mean": -1.206615686416626, + "rewards/thinking_verbosity_reward/std": 0.3108856976032257, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 539.0625, + "epoch": 0.42630385487528344, + "grad_norm": 0.022404877468943596, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45458191.0, + "rewards/KL_reward/mean": -0.017527278512716293, + "rewards/KL_reward/std": 0.01198546402156353, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.0038344012573361397, + "rewards/angle_reward/std": 0.7199822664260864, + "rewards/thinking_verbosity_reward/mean": -1.1370865106582642, + "rewards/thinking_verbosity_reward/std": 0.2293068766593933, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 506.328125, + "epoch": 0.427437641723356, + "grad_norm": 0.021387537941336632, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45555321.0, + "rewards/KL_reward/mean": -0.019968044012784958, + "rewards/KL_reward/std": 0.013661942444741726, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.01847529038786888, + "rewards/angle_reward/std": 0.7076072096824646, + "rewards/thinking_verbosity_reward/mean": -1.1016631126403809, + "rewards/thinking_verbosity_reward/std": 0.22371789813041687, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 556.765625, + "epoch": 0.42857142857142855, + "grad_norm": 0.021009325981140137, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 45657675.0, + "rewards/KL_reward/mean": -0.016950562596321106, + "rewards/KL_reward/std": 0.013891222886741161, + "rewards/accuracy_reward/mean": 0.2734375, + "rewards/accuracy_reward/std": 0.447474867105484, + "rewards/angle_reward/mean": 0.06334151327610016, + "rewards/angle_reward/std": 0.6955203413963318, + "rewards/thinking_verbosity_reward/mean": -1.1447854042053223, + "rewards/thinking_verbosity_reward/std": 0.28192561864852905, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 504.0703125, + "epoch": 0.42970521541950113, + "grad_norm": 0.025582019239664078, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45753852.0, + "rewards/KL_reward/mean": -0.018711145967245102, + "rewards/KL_reward/std": 0.013790667988359928, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.112864650785923, + "rewards/angle_reward/std": 0.7187625765800476, + "rewards/thinking_verbosity_reward/mean": -1.0957250595092773, + "rewards/thinking_verbosity_reward/std": 0.23977471888065338, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 583.546875, + "epoch": 0.4308390022675737, + "grad_norm": 0.019505100324749947, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45860730.0, + "rewards/KL_reward/mean": -0.0188748836517334, + "rewards/KL_reward/std": 0.013765603303909302, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.05972357839345932, + "rewards/angle_reward/std": 0.6861174702644348, + "rewards/thinking_verbosity_reward/mean": -1.185563564300537, + "rewards/thinking_verbosity_reward/std": 0.22624468803405762, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 699.2734375, + "epoch": 0.43197278911564624, + "grad_norm": 0.019939446821808815, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 45982389.0, + "rewards/KL_reward/mean": -0.014938908629119396, + "rewards/KL_reward/std": 0.011243962682783604, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.10991691052913666, + "rewards/angle_reward/std": 0.7018421292304993, + "rewards/thinking_verbosity_reward/mean": -1.2750121355056763, + "rewards/thinking_verbosity_reward/std": 0.34786784648895264, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 579.0234375, + "epoch": 0.4331065759637188, + "grad_norm": 0.022386744618415833, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46088696.0, + "rewards/KL_reward/mean": -0.016111835837364197, + "rewards/KL_reward/std": 0.012026161886751652, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": 0.04101593792438507, + "rewards/angle_reward/std": 0.7212110161781311, + "rewards/thinking_verbosity_reward/mean": -1.1688454151153564, + "rewards/thinking_verbosity_reward/std": 0.2818171977996826, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 463.7734375, + "epoch": 0.4342403628117914, + "grad_norm": 0.02676510065793991, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46180715.0, + "rewards/KL_reward/mean": -0.02379307523369789, + "rewards/KL_reward/std": 0.014045041054487228, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22826264798641205, + "rewards/angle_reward/mean": -0.05956241860985756, + "rewards/angle_reward/std": 0.6922198534011841, + "rewards/thinking_verbosity_reward/mean": -1.0482418537139893, + "rewards/thinking_verbosity_reward/std": 0.2420331984758377, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 539.8125, + "epoch": 0.43537414965986393, + "grad_norm": 0.020799271762371063, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46281923.0, + "rewards/KL_reward/mean": -0.01960766687989235, + "rewards/KL_reward/std": 0.01396810170263052, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.034598276019096375, + "rewards/angle_reward/std": 0.6756842732429504, + "rewards/thinking_verbosity_reward/mean": -1.1341664791107178, + "rewards/thinking_verbosity_reward/std": 0.24729718267917633, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 635.9609375, + "epoch": 0.4365079365079365, + "grad_norm": 0.019486747682094574, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46394798.0, + "rewards/KL_reward/mean": -0.015526726841926575, + "rewards/KL_reward/std": 0.01156558282673359, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.010235173627734184, + "rewards/angle_reward/std": 0.7484339475631714, + "rewards/thinking_verbosity_reward/mean": -1.2333378791809082, + "rewards/thinking_verbosity_reward/std": 0.2584221661090851, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 525.90625, + "epoch": 0.4376417233560091, + "grad_norm": 0.023115647956728935, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 46493194.0, + "rewards/KL_reward/mean": -0.02045394666492939, + "rewards/KL_reward/std": 0.015044222585856915, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.005462062545120716, + "rewards/angle_reward/std": 0.709326446056366, + "rewards/thinking_verbosity_reward/mean": -1.1223537921905518, + "rewards/thinking_verbosity_reward/std": 0.23022116720676422, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 555.9375, + "epoch": 0.4387755102040816, + "grad_norm": 0.02062627114355564, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46596610.0, + "rewards/KL_reward/mean": -0.02183781936764717, + "rewards/KL_reward/std": 0.01346514094620943, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.194504976272583, + "rewards/angle_reward/mean": 0.08628194779157639, + "rewards/angle_reward/std": 0.6790797114372253, + "rewards/thinking_verbosity_reward/mean": -1.1447288990020752, + "rewards/thinking_verbosity_reward/std": 0.27846965193748474, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 644.078125, + "epoch": 0.4399092970521542, + "grad_norm": 0.01849028281867504, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 46711204.0, + "rewards/KL_reward/mean": -0.014653094112873077, + "rewards/KL_reward/std": 0.010434862226247787, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.004651002585887909, + "rewards/angle_reward/std": 0.6873242855072021, + "rewards/thinking_verbosity_reward/mean": -1.2303717136383057, + "rewards/thinking_verbosity_reward/std": 0.30763575434684753, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 503.7421875, + "epoch": 0.4410430839002268, + "grad_norm": 0.025818170979619026, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46807651.0, + "rewards/KL_reward/mean": -0.020328128710389137, + "rewards/KL_reward/std": 0.013701298274099827, + "rewards/accuracy_reward/mean": 0.2109375, + "rewards/accuracy_reward/std": 0.4095771610736847, + "rewards/angle_reward/mean": -0.047695379704236984, + "rewards/angle_reward/std": 0.695172905921936, + "rewards/thinking_verbosity_reward/mean": -1.0927703380584717, + "rewards/thinking_verbosity_reward/std": 0.25136125087738037, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 565.3203125, + "epoch": 0.4421768707482993, + "grad_norm": 0.022490637376904488, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 46911300.0, + "rewards/KL_reward/mean": -0.026106202974915504, + "rewards/KL_reward/std": 0.026732752099633217, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.03920355811715126, + "rewards/angle_reward/std": 0.6827701926231384, + "rewards/thinking_verbosity_reward/mean": -1.1563987731933594, + "rewards/thinking_verbosity_reward/std": 0.2722534239292145, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 564.9296875, + "epoch": 0.4433106575963719, + "grad_norm": 0.021665828302502632, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47015707.0, + "rewards/KL_reward/mean": -0.019879762083292007, + "rewards/KL_reward/std": 0.014529038220643997, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.03593013435602188, + "rewards/angle_reward/std": 0.7235757112503052, + "rewards/thinking_verbosity_reward/mean": -1.1605231761932373, + "rewards/thinking_verbosity_reward/std": 0.2520044147968292, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 421.9375, + "epoch": 0.4444444444444444, + "grad_norm": 0.025612734258174896, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 47101067.0, + "rewards/KL_reward/mean": -0.027379008010029793, + "rewards/KL_reward/std": 0.014442806132137775, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.08324871957302094, + "rewards/angle_reward/std": 0.6658397316932678, + "rewards/thinking_verbosity_reward/mean": -1.0087127685546875, + "rewards/thinking_verbosity_reward/std": 0.18739484250545502, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 507.890625, + "epoch": 0.445578231292517, + "grad_norm": 0.024070149287581444, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 47198117.0, + "rewards/KL_reward/mean": -0.020547538995742798, + "rewards/KL_reward/std": 0.014126886613667011, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.005580000579357147, + "rewards/angle_reward/std": 0.7161521911621094, + "rewards/thinking_verbosity_reward/mean": -1.1085846424102783, + "rewards/thinking_verbosity_reward/std": 0.19633635878562927, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 480.65625, + "epoch": 0.4467120181405896, + "grad_norm": 0.021039605140686035, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47291625.0, + "rewards/KL_reward/mean": -0.021802667528390884, + "rewards/KL_reward/std": 0.01594499498605728, + "rewards/accuracy_reward/mean": 0.265625, + "rewards/accuracy_reward/std": 0.44340085983276367, + "rewards/angle_reward/mean": 0.029494380578398705, + "rewards/angle_reward/std": 0.7037698030471802, + "rewards/thinking_verbosity_reward/mean": -1.073141098022461, + "rewards/thinking_verbosity_reward/std": 0.2188214212656021, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 550.5, + "epoch": 0.4478458049886621, + "grad_norm": 0.02214093878865242, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47393793.0, + "rewards/KL_reward/mean": -0.021151121705770493, + "rewards/KL_reward/std": 0.014054550789296627, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "rewards/angle_reward/mean": 0.009490861557424068, + "rewards/angle_reward/std": 0.6957032680511475, + "rewards/thinking_verbosity_reward/mean": -1.1497143507003784, + "rewards/thinking_verbosity_reward/std": 0.22868293523788452, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 596.8984375, + "epoch": 0.4489795918367347, + "grad_norm": 0.02260676957666874, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47501172.0, + "rewards/KL_reward/mean": -0.021430671215057373, + "rewards/KL_reward/std": 0.01571296900510788, + "rewards/accuracy_reward/mean": 0.203125, + "rewards/accuracy_reward/std": 0.40390563011169434, + "rewards/angle_reward/mean": -0.0022510290145874023, + "rewards/angle_reward/std": 0.6822193264961243, + "rewards/thinking_verbosity_reward/mean": -1.1782336235046387, + "rewards/thinking_verbosity_reward/std": 0.3198300004005432, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 566.7890625, + "epoch": 0.4501133786848073, + "grad_norm": 0.020635398104786873, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 47605857.0, + "rewards/KL_reward/mean": -0.02311992645263672, + "rewards/KL_reward/std": 0.01627832092344761, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": 0.0059994058683514595, + "rewards/angle_reward/std": 0.7020658850669861, + "rewards/thinking_verbosity_reward/mean": -1.1678216457366943, + "rewards/thinking_verbosity_reward/std": 0.22597408294677734, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 532.828125, + "epoch": 0.4512471655328798, + "grad_norm": 0.02402258850634098, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 47706115.0, + "rewards/KL_reward/mean": -0.02359929494559765, + "rewards/KL_reward/std": 0.01530496682971716, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.03661785274744034, + "rewards/angle_reward/std": 0.7108094096183777, + "rewards/thinking_verbosity_reward/mean": -1.1365866661071777, + "rewards/thinking_verbosity_reward/std": 0.19503378868103027, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 635.484375, + "epoch": 0.4523809523809524, + "grad_norm": 0.02138921432197094, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 47818993.0, + "rewards/KL_reward/mean": -0.02280566468834877, + "rewards/KL_reward/std": 0.015408539213240147, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.01748541370034218, + "rewards/angle_reward/std": 0.6938936710357666, + "rewards/thinking_verbosity_reward/mean": -1.2105220556259155, + "rewards/thinking_verbosity_reward/std": 0.34896862506866455, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 522.609375, + "epoch": 0.45351473922902497, + "grad_norm": 0.023752223700284958, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 47917839.0, + "rewards/KL_reward/mean": -0.0232955701649189, + "rewards/KL_reward/std": 0.01624547690153122, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.0803392305970192, + "rewards/angle_reward/std": 0.7035095691680908, + "rewards/thinking_verbosity_reward/mean": -1.1072452068328857, + "rewards/thinking_verbosity_reward/std": 0.28043869137763977, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 535.421875, + "epoch": 0.4546485260770975, + "grad_norm": 0.022278867661952972, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48018445.0, + "rewards/KL_reward/mean": -0.023161131888628006, + "rewards/KL_reward/std": 0.01387379877269268, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": 0.0032333284616470337, + "rewards/angle_reward/std": 0.7280095219612122, + "rewards/thinking_verbosity_reward/mean": -1.1364167928695679, + "rewards/thinking_verbosity_reward/std": 0.21204441785812378, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 492.0546875, + "epoch": 0.4557823129251701, + "grad_norm": 0.022933771833777428, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48113244.0, + "rewards/KL_reward/mean": -0.02552301064133644, + "rewards/KL_reward/std": 0.01558267418295145, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.08489933609962463, + "rewards/angle_reward/std": 0.7113597393035889, + "rewards/thinking_verbosity_reward/mean": -1.092618703842163, + "rewards/thinking_verbosity_reward/std": 0.1845749169588089, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 473.4375, + "epoch": 0.45691609977324266, + "grad_norm": 0.02334679663181305, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48205676.0, + "rewards/KL_reward/mean": -0.02721046656370163, + "rewards/KL_reward/std": 0.01823466457426548, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.02178448811173439, + "rewards/angle_reward/std": 0.6964385509490967, + "rewards/thinking_verbosity_reward/mean": -1.0612900257110596, + "rewards/thinking_verbosity_reward/std": 0.2350475788116455, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 470.71875, + "epoch": 0.4580498866213152, + "grad_norm": 0.024207444861531258, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48298328.0, + "rewards/KL_reward/mean": -0.026227515190839767, + "rewards/KL_reward/std": 0.01750863343477249, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": 0.08088646084070206, + "rewards/angle_reward/std": 0.6970916390419006, + "rewards/thinking_verbosity_reward/mean": -1.0650391578674316, + "rewards/thinking_verbosity_reward/std": 0.20070774853229523, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 497.03125, + "epoch": 0.45918367346938777, + "grad_norm": 0.026575271040201187, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48394196.0, + "rewards/KL_reward/mean": -0.02246716618537903, + "rewards/KL_reward/std": 0.01457224227488041, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.10408850014209747, + "rewards/angle_reward/std": 0.7194622755050659, + "rewards/thinking_verbosity_reward/mean": -1.0898295640945435, + "rewards/thinking_verbosity_reward/std": 0.2296569049358368, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 560.3984375, + "epoch": 0.4603174603174603, + "grad_norm": 0.02821190282702446, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48498383.0, + "rewards/KL_reward/mean": -0.02293389104306698, + "rewards/KL_reward/std": 0.016351953148841858, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.11334050446748734, + "rewards/angle_reward/std": 0.7183730602264404, + "rewards/thinking_verbosity_reward/mean": -1.1596062183380127, + "rewards/thinking_verbosity_reward/std": 0.2328803390264511, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 542.1796875, + "epoch": 0.4614512471655329, + "grad_norm": 0.022647786885499954, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48599854.0, + "rewards/KL_reward/mean": -0.023735279217362404, + "rewards/KL_reward/std": 0.015727022662758827, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": 0.006649286486208439, + "rewards/angle_reward/std": 0.7050229907035828, + "rewards/thinking_verbosity_reward/mean": -1.139451026916504, + "rewards/thinking_verbosity_reward/std": 0.23458538949489594, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 471.328125, + "epoch": 0.46258503401360546, + "grad_norm": 0.02578257955610752, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48692136.0, + "rewards/KL_reward/mean": -0.025398539379239082, + "rewards/KL_reward/std": 0.016654588282108307, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.019923239946365356, + "rewards/angle_reward/std": 0.7124901413917542, + "rewards/thinking_verbosity_reward/mean": -1.0590758323669434, + "rewards/thinking_verbosity_reward/std": 0.23363560438156128, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 604.5546875, + "epoch": 0.463718820861678, + "grad_norm": 0.022921523079276085, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48801967.0, + "rewards/KL_reward/mean": -0.023579616099596024, + "rewards/KL_reward/std": 0.0175795741379261, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.05431587994098663, + "rewards/angle_reward/std": 0.6877104043960571, + "rewards/thinking_verbosity_reward/mean": -1.1805675029754639, + "rewards/thinking_verbosity_reward/std": 0.340753436088562, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 494.09375, + "epoch": 0.46485260770975056, + "grad_norm": 0.024589331820607185, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 48896779.0, + "rewards/KL_reward/mean": -0.027412384748458862, + "rewards/KL_reward/std": 0.016354065388441086, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.05277753621339798, + "rewards/angle_reward/std": 0.6910402774810791, + "rewards/thinking_verbosity_reward/mean": -1.088793158531189, + "rewards/thinking_verbosity_reward/std": 0.21807634830474854, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 526.328125, + "epoch": 0.46598639455782315, + "grad_norm": 0.021234115585684776, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 48995669.0, + "rewards/KL_reward/mean": -0.03023386560380459, + "rewards/KL_reward/std": 0.021569764241576195, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.04875587671995163, + "rewards/angle_reward/std": 0.6895540356636047, + "rewards/thinking_verbosity_reward/mean": -1.1237661838531494, + "rewards/thinking_verbosity_reward/std": 0.2254941314458847, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 536.046875, + "epoch": 0.4671201814058957, + "grad_norm": 0.0258078221231699, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49096083.0, + "rewards/KL_reward/mean": -0.02499195747077465, + "rewards/KL_reward/std": 0.01609044522047043, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.021279610693454742, + "rewards/angle_reward/std": 0.7131556272506714, + "rewards/thinking_verbosity_reward/mean": -1.123023271560669, + "rewards/thinking_verbosity_reward/std": 0.27755653858184814, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 476.859375, + "epoch": 0.46825396825396826, + "grad_norm": 0.03131388872861862, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 49189033.0, + "rewards/KL_reward/mean": -0.028134461492300034, + "rewards/KL_reward/std": 0.018374817445874214, + "rewards/accuracy_reward/mean": 0.2890625, + "rewards/accuracy_reward/std": 0.45510825514793396, + "rewards/angle_reward/mean": -0.025143563747406006, + "rewards/angle_reward/std": 0.7260422706604004, + "rewards/thinking_verbosity_reward/mean": -1.0641911029815674, + "rewards/thinking_verbosity_reward/std": 0.23999130725860596, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 473.953125, + "epoch": 0.46938775510204084, + "grad_norm": 0.023590773344039917, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49281419.0, + "rewards/KL_reward/mean": -0.02742850035429001, + "rewards/KL_reward/std": 0.02026979625225067, + "rewards/accuracy_reward/mean": 0.3203125, + "rewards/accuracy_reward/std": 0.4684300124645233, + "rewards/angle_reward/mean": 0.03548327833414078, + "rewards/angle_reward/std": 0.7357466816902161, + "rewards/thinking_verbosity_reward/mean": -1.0584043264389038, + "rewards/thinking_verbosity_reward/std": 0.2503049075603485, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 512.1484375, + "epoch": 0.47052154195011336, + "grad_norm": 0.020758090540766716, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49378894.0, + "rewards/KL_reward/mean": -0.027455642819404602, + "rewards/KL_reward/std": 0.018511703237891197, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.028938554227352142, + "rewards/angle_reward/std": 0.7072205543518066, + "rewards/thinking_verbosity_reward/mean": -1.1127849817276, + "rewards/thinking_verbosity_reward/std": 0.19933627545833588, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 490.8671875, + "epoch": 0.47165532879818595, + "grad_norm": 0.023538529872894287, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 49473717.0, + "rewards/KL_reward/mean": -0.03473803400993347, + "rewards/KL_reward/std": 0.02424466982483864, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": -0.07885561883449554, + "rewards/angle_reward/std": 0.6736251711845398, + "rewards/thinking_verbosity_reward/mean": -1.0767022371292114, + "rewards/thinking_verbosity_reward/std": 0.2567003071308136, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 562.8984375, + "epoch": 0.47278911564625853, + "grad_norm": 0.02368626557290554, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49577232.0, + "rewards/KL_reward/mean": -0.028153687715530396, + "rewards/KL_reward/std": 0.017911860719323158, + "rewards/accuracy_reward/mean": 0.171875, + "rewards/accuracy_reward/std": 0.3787541687488556, + "rewards/angle_reward/mean": -0.01273367926478386, + "rewards/angle_reward/std": 0.705632209777832, + "rewards/thinking_verbosity_reward/mean": -1.150090217590332, + "rewards/thinking_verbosity_reward/std": 0.28758352994918823, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 466.703125, + "epoch": 0.47392290249433106, + "grad_norm": 0.02429896593093872, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 49668962.0, + "rewards/KL_reward/mean": -0.031044352799654007, + "rewards/KL_reward/std": 0.01687506027519703, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.05802838131785393, + "rewards/angle_reward/std": 0.6904194951057434, + "rewards/thinking_verbosity_reward/mean": -1.0635954141616821, + "rewards/thinking_verbosity_reward/std": 0.18243525922298431, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 456.890625, + "epoch": 0.47505668934240364, + "grad_norm": 0.028251731768250465, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 49758980.0, + "rewards/KL_reward/mean": -0.03525323420763016, + "rewards/KL_reward/std": 0.02170429937541485, + "rewards/accuracy_reward/mean": 0.3046875, + "rewards/accuracy_reward/std": 0.46208351850509644, + "rewards/angle_reward/mean": 0.05117577686905861, + "rewards/angle_reward/std": 0.7218906879425049, + "rewards/thinking_verbosity_reward/mean": -1.0450398921966553, + "rewards/thinking_verbosity_reward/std": 0.2190735787153244, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 400.375, + "epoch": 0.47619047619047616, + "grad_norm": 0.02744399756193161, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 49842364.0, + "rewards/KL_reward/mean": -0.039281006902456284, + "rewards/KL_reward/std": 0.025512907654047012, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.04501733183860779, + "rewards/angle_reward/std": 0.6914629936218262, + "rewards/thinking_verbosity_reward/mean": -0.9827430248260498, + "rewards/thinking_verbosity_reward/std": 0.1813047230243683, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 457.8671875, + "epoch": 0.47732426303854875, + "grad_norm": 0.024862121790647507, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 49932675.0, + "rewards/KL_reward/mean": -0.033260006457567215, + "rewards/KL_reward/std": 0.017655346542596817, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": 0.023770904168486595, + "rewards/angle_reward/std": 0.7306255102157593, + "rewards/thinking_verbosity_reward/mean": -1.0440349578857422, + "rewards/thinking_verbosity_reward/std": 0.22932352125644684, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 471.09375, + "epoch": 0.47845804988662133, + "grad_norm": 0.029904166236519814, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50024623.0, + "rewards/KL_reward/mean": -0.03850427269935608, + "rewards/KL_reward/std": 0.030644718557596207, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.013057619333267212, + "rewards/angle_reward/std": 0.6803799271583557, + "rewards/thinking_verbosity_reward/mean": -1.0541119575500488, + "rewards/thinking_verbosity_reward/std": 0.25413978099823, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 451.3984375, + "epoch": 0.47959183673469385, + "grad_norm": 0.0292107742279768, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50114562.0, + "rewards/KL_reward/mean": -0.03692799434065819, + "rewards/KL_reward/std": 0.021713055670261383, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -9.727664291858673e-05, + "rewards/angle_reward/std": 0.7122227549552917, + "rewards/thinking_verbosity_reward/mean": -1.0429333448410034, + "rewards/thinking_verbosity_reward/std": 0.1964370459318161, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 513.953125, + "epoch": 0.48072562358276644, + "grad_norm": 0.02299739606678486, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50212252.0, + "rewards/KL_reward/mean": -0.0356745719909668, + "rewards/KL_reward/std": 0.024989625439047813, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.040337808430194855, + "rewards/angle_reward/std": 0.7102200388908386, + "rewards/thinking_verbosity_reward/mean": -1.1041864156723022, + "rewards/thinking_verbosity_reward/std": 0.25233370065689087, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 404.171875, + "epoch": 0.481859410430839, + "grad_norm": 0.024677474051713943, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50295098.0, + "rewards/KL_reward/mean": -0.039033617824316025, + "rewards/KL_reward/std": 0.025141440331935883, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.04172428324818611, + "rewards/angle_reward/std": 0.7007785439491272, + "rewards/thinking_verbosity_reward/mean": -0.9804962277412415, + "rewards/thinking_verbosity_reward/std": 0.21661804616451263, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 393.8046875, + "epoch": 0.48299319727891155, + "grad_norm": 0.02994668297469616, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50377305.0, + "rewards/KL_reward/mean": -0.04187620431184769, + "rewards/KL_reward/std": 0.027131345123052597, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.06471440196037292, + "rewards/angle_reward/std": 0.6868351101875305, + "rewards/thinking_verbosity_reward/mean": -0.96719890832901, + "rewards/thinking_verbosity_reward/std": 0.21648328006267548, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 444.3515625, + "epoch": 0.48412698412698413, + "grad_norm": 0.025785937905311584, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50466102.0, + "rewards/KL_reward/mean": -0.0387788824737072, + "rewards/KL_reward/std": 0.018455451354384422, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": 0.012625513598322868, + "rewards/angle_reward/std": 0.693104088306427, + "rewards/thinking_verbosity_reward/mean": -1.03594970703125, + "rewards/thinking_verbosity_reward/std": 0.18826691806316376, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 509.0234375, + "epoch": 0.4852607709750567, + "grad_norm": 0.02384641021490097, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50563505.0, + "rewards/KL_reward/mean": -0.03446277976036072, + "rewards/KL_reward/std": 0.02286764234304428, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.04777942970395088, + "rewards/angle_reward/std": 0.6951702237129211, + "rewards/thinking_verbosity_reward/mean": -1.0935171842575073, + "rewards/thinking_verbosity_reward/std": 0.2736482620239258, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 389.0859375, + "epoch": 0.48639455782312924, + "grad_norm": 0.032591525465250015, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50645268.0, + "rewards/KL_reward/mean": -0.04487357661128044, + "rewards/KL_reward/std": 0.025163918733596802, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": 0.011689020320773125, + "rewards/angle_reward/std": 0.6959567070007324, + "rewards/thinking_verbosity_reward/mean": -0.9662455916404724, + "rewards/thinking_verbosity_reward/std": 0.192021906375885, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 416.7265625, + "epoch": 0.4875283446712018, + "grad_norm": 0.030018575489521027, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50730665.0, + "rewards/KL_reward/mean": -0.046098943799734116, + "rewards/KL_reward/std": 0.026156146079301834, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.05590461194515228, + "rewards/angle_reward/std": 0.6785492300987244, + "rewards/thinking_verbosity_reward/mean": -0.9995837211608887, + "rewards/thinking_verbosity_reward/std": 0.2010611891746521, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 479.203125, + "epoch": 0.4886621315192744, + "grad_norm": 0.022822268307209015, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50824171.0, + "rewards/KL_reward/mean": -0.0350266769528389, + "rewards/KL_reward/std": 0.024317413568496704, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.02126418985426426, + "rewards/angle_reward/std": 0.7227473258972168, + "rewards/thinking_verbosity_reward/mean": -1.0733683109283447, + "rewards/thinking_verbosity_reward/std": 0.2090698778629303, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 350.5859375, + "epoch": 0.4897959183673469, + "grad_norm": 0.03339194133877754, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 50901174.0, + "rewards/KL_reward/mean": -0.04047293961048126, + "rewards/KL_reward/std": 0.027730628848075867, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": -0.03062419593334198, + "rewards/angle_reward/std": 0.686697244644165, + "rewards/thinking_verbosity_reward/mean": -0.9178509712219238, + "rewards/thinking_verbosity_reward/std": 0.1779995560646057, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 422.5546875, + "epoch": 0.4909297052154195, + "grad_norm": 0.028161540627479553, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 50987429.0, + "rewards/KL_reward/mean": -0.04439615458250046, + "rewards/KL_reward/std": 0.025982331484556198, + "rewards/accuracy_reward/mean": 0.1015625, + "rewards/accuracy_reward/std": 0.3032590448856354, + "rewards/angle_reward/mean": -0.004183335229754448, + "rewards/angle_reward/std": 0.7039921879768372, + "rewards/thinking_verbosity_reward/mean": -1.012196660041809, + "rewards/thinking_verbosity_reward/std": 0.17203310132026672, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 409.03125, + "epoch": 0.49206349206349204, + "grad_norm": 0.025642655789852142, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 51071617.0, + "rewards/KL_reward/mean": -0.0451393648982048, + "rewards/KL_reward/std": 0.026113361120224, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.014498209580779076, + "rewards/angle_reward/std": 0.6932234764099121, + "rewards/thinking_verbosity_reward/mean": -0.9939348697662354, + "rewards/thinking_verbosity_reward/std": 0.18006928265094757, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 411.46875, + "epoch": 0.4931972789115646, + "grad_norm": 0.028984270989894867, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51156709.0, + "rewards/KL_reward/mean": -0.04902590066194534, + "rewards/KL_reward/std": 0.02902785688638687, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.001699242740869522, + "rewards/angle_reward/std": 0.7301017045974731, + "rewards/thinking_verbosity_reward/mean": -0.990402102470398, + "rewards/thinking_verbosity_reward/std": 0.2136627733707428, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 421.859375, + "epoch": 0.4943310657596372, + "grad_norm": 0.02719872258603573, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51242339.0, + "rewards/KL_reward/mean": -0.04764976352453232, + "rewards/KL_reward/std": 0.02956109680235386, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": -0.04444997012615204, + "rewards/angle_reward/std": 0.682907223701477, + "rewards/thinking_verbosity_reward/mean": -1.0076520442962646, + "rewards/thinking_verbosity_reward/std": 0.19254931807518005, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 403.5546875, + "epoch": 0.4954648526077097, + "grad_norm": 0.02942391112446785, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51326618.0, + "rewards/KL_reward/mean": -0.052202943712472916, + "rewards/KL_reward/std": 0.03445935621857643, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.0014247801154851913, + "rewards/angle_reward/std": 0.695796012878418, + "rewards/thinking_verbosity_reward/mean": -0.985343337059021, + "rewards/thinking_verbosity_reward/std": 0.18911543488502502, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 356.40625, + "epoch": 0.4965986394557823, + "grad_norm": 0.03621532395482063, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 51404134.0, + "rewards/KL_reward/mean": -0.05607658624649048, + "rewards/KL_reward/std": 0.0350998118519783, + "rewards/accuracy_reward/mean": 0.1796875, + "rewards/accuracy_reward/std": 0.3854354918003082, + "rewards/angle_reward/mean": -0.01746700517833233, + "rewards/angle_reward/std": 0.7109165191650391, + "rewards/thinking_verbosity_reward/mean": -0.9277302622795105, + "rewards/thinking_verbosity_reward/std": 0.16754429042339325, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 392.90625, + "epoch": 0.4977324263038549, + "grad_norm": 0.03103005886077881, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 51486514.0, + "rewards/KL_reward/mean": -0.057342153042554855, + "rewards/KL_reward/std": 0.03963745757937431, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.06879903376102448, + "rewards/angle_reward/std": 0.6926581263542175, + "rewards/thinking_verbosity_reward/mean": -0.9679933190345764, + "rewards/thinking_verbosity_reward/std": 0.207347109913826, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 392.453125, + "epoch": 0.4988662131519274, + "grad_norm": 0.03392069414258003, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51568908.0, + "rewards/KL_reward/mean": -0.05464167892932892, + "rewards/KL_reward/std": 0.024992190301418304, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.047817666083574295, + "rewards/angle_reward/std": 0.6898568272590637, + "rewards/thinking_verbosity_reward/mean": -0.9665283560752869, + "rewards/thinking_verbosity_reward/std": 0.2116737812757492, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 396.09375, + "epoch": 0.5, + "grad_norm": 0.03598444536328316, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 51650688.0, + "rewards/KL_reward/mean": -0.061555199325084686, + "rewards/KL_reward/std": 0.03372234106063843, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.006012503057718277, + "rewards/angle_reward/std": 0.699260413646698, + "rewards/thinking_verbosity_reward/mean": -0.9670593738555908, + "rewards/thinking_verbosity_reward/std": 0.22992496192455292, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 382.9296875, + "epoch": 0.5011337868480725, + "grad_norm": 0.02516038529574871, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51731551.0, + "rewards/KL_reward/mean": -0.06171301752328873, + "rewards/KL_reward/std": 0.036923423409461975, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.024304382503032684, + "rewards/angle_reward/std": 0.6884368062019348, + "rewards/thinking_verbosity_reward/mean": -0.95870041847229, + "rewards/thinking_verbosity_reward/std": 0.18973399698734283, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 399.3359375, + "epoch": 0.5022675736961452, + "grad_norm": 0.030778488144278526, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51815138.0, + "rewards/KL_reward/mean": -0.0707942545413971, + "rewards/KL_reward/std": 0.04487849026918411, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.194504976272583, + "rewards/angle_reward/mean": -0.0013290448114275932, + "rewards/angle_reward/std": 0.6961056590080261, + "rewards/thinking_verbosity_reward/mean": -0.9707742929458618, + "rewards/thinking_verbosity_reward/std": 0.23211538791656494, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 403.15625, + "epoch": 0.5034013605442177, + "grad_norm": 0.034018371254205704, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 51898326.0, + "rewards/KL_reward/mean": -0.064726322889328, + "rewards/KL_reward/std": 0.03655809909105301, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": 0.035316262394189835, + "rewards/angle_reward/std": 0.6933196783065796, + "rewards/thinking_verbosity_reward/mean": -0.9698935747146606, + "rewards/thinking_verbosity_reward/std": 0.25535571575164795, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 342.8046875, + "epoch": 0.5045351473922902, + "grad_norm": 0.03644280880689621, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 51974157.0, + "rewards/KL_reward/mean": -0.07590562105178833, + "rewards/KL_reward/std": 0.03883231058716774, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.008078407496213913, + "rewards/angle_reward/std": 0.7256864905357361, + "rewards/thinking_verbosity_reward/mean": -0.9066504240036011, + "rewards/thinking_verbosity_reward/std": 0.18092256784439087, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 354.25, + "epoch": 0.5056689342403629, + "grad_norm": 0.034074749797582626, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52051261.0, + "rewards/KL_reward/mean": -0.06324449926614761, + "rewards/KL_reward/std": 0.028212472796440125, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.0013993959873914719, + "rewards/angle_reward/std": 0.7149010896682739, + "rewards/thinking_verbosity_reward/mean": -0.9295545220375061, + "rewards/thinking_verbosity_reward/std": 0.13850507140159607, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 381.3984375, + "epoch": 0.5068027210884354, + "grad_norm": 0.0305367149412632, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52131848.0, + "rewards/KL_reward/mean": -0.06172921508550644, + "rewards/KL_reward/std": 0.04027363657951355, + "rewards/accuracy_reward/mean": 0.234375, + "rewards/accuracy_reward/std": 0.42527204751968384, + "rewards/angle_reward/mean": -0.05292728170752525, + "rewards/angle_reward/std": 0.6935394406318665, + "rewards/thinking_verbosity_reward/mean": -0.9622151851654053, + "rewards/thinking_verbosity_reward/std": 0.15917249023914337, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 367.2734375, + "epoch": 0.5079365079365079, + "grad_norm": 0.03114541992545128, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52210963.0, + "rewards/KL_reward/mean": -0.06039270758628845, + "rewards/KL_reward/std": 0.03316747024655342, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.024122249335050583, + "rewards/angle_reward/std": 0.7034289836883545, + "rewards/thinking_verbosity_reward/mean": -0.9365625977516174, + "rewards/thinking_verbosity_reward/std": 0.19702234864234924, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 332.9921875, + "epoch": 0.5090702947845805, + "grad_norm": 0.030634764581918716, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52285426.0, + "rewards/KL_reward/mean": -0.08006307482719421, + "rewards/KL_reward/std": 0.049913886934518814, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.02648836001753807, + "rewards/angle_reward/std": 0.6740854382514954, + "rewards/thinking_verbosity_reward/mean": -0.8969378471374512, + "rewards/thinking_verbosity_reward/std": 0.1602609008550644, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 388.625, + "epoch": 0.5102040816326531, + "grad_norm": 0.03345860168337822, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52366106.0, + "rewards/KL_reward/mean": -0.07310838997364044, + "rewards/KL_reward/std": 0.037350211292505264, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.10642776638269424, + "rewards/angle_reward/std": 0.6874831914901733, + "rewards/thinking_verbosity_reward/mean": -0.959571361541748, + "rewards/thinking_verbosity_reward/std": 0.22060275077819824, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 386.3125, + "epoch": 0.5113378684807256, + "grad_norm": 0.03417220711708069, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52447378.0, + "rewards/KL_reward/mean": -0.06972689926624298, + "rewards/KL_reward/std": 0.038145698606967926, + "rewards/accuracy_reward/mean": 0.1953125, + "rewards/accuracy_reward/std": 0.3979988098144531, + "rewards/angle_reward/mean": 0.06792110949754715, + "rewards/angle_reward/std": 0.7083807587623596, + "rewards/thinking_verbosity_reward/mean": -0.9497437477111816, + "rewards/thinking_verbosity_reward/std": 0.2483774721622467, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 357.0703125, + "epoch": 0.5124716553287982, + "grad_norm": 0.03606312349438667, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52525219.0, + "rewards/KL_reward/mean": -0.09044323861598969, + "rewards/KL_reward/std": 0.057571690529584885, + "rewards/accuracy_reward/mean": 0.25, + "rewards/accuracy_reward/std": 0.434714138507843, + "rewards/angle_reward/mean": -0.01749390922486782, + "rewards/angle_reward/std": 0.7063094973564148, + "rewards/thinking_verbosity_reward/mean": -0.9207316040992737, + "rewards/thinking_verbosity_reward/std": 0.20679256319999695, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 316.4140625, + "epoch": 0.5136054421768708, + "grad_norm": 0.038564532995224, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52598072.0, + "rewards/KL_reward/mean": -0.09466668218374252, + "rewards/KL_reward/std": 0.04971272870898247, + "rewards/accuracy_reward/mean": 0.109375, + "rewards/accuracy_reward/std": 0.31333550810813904, + "rewards/angle_reward/mean": -0.06022874265909195, + "rewards/angle_reward/std": 0.7210524082183838, + "rewards/thinking_verbosity_reward/mean": -0.8652991056442261, + "rewards/thinking_verbosity_reward/std": 0.20021560788154602, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 372.609375, + "epoch": 0.5147392290249433, + "grad_norm": 0.035059019923210144, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52677518.0, + "rewards/KL_reward/mean": -0.08334328979253769, + "rewards/KL_reward/std": 0.0473569817841053, + "rewards/accuracy_reward/mean": 0.15625, + "rewards/accuracy_reward/std": 0.3645188808441162, + "rewards/angle_reward/mean": -0.015776190906763077, + "rewards/angle_reward/std": 0.6482148766517639, + "rewards/thinking_verbosity_reward/mean": -0.937674880027771, + "rewards/thinking_verbosity_reward/std": 0.22401179373264313, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 391.765625, + "epoch": 0.5158730158730159, + "grad_norm": 0.031195441260933876, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52759248.0, + "rewards/KL_reward/mean": -0.07809939235448837, + "rewards/KL_reward/std": 0.04455532878637314, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": 0.008754374459385872, + "rewards/angle_reward/std": 0.6928679347038269, + "rewards/thinking_verbosity_reward/mean": -0.9701083898544312, + "rewards/thinking_verbosity_reward/std": 0.18991024792194366, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 310.3671875, + "epoch": 0.5170068027210885, + "grad_norm": 0.04037066176533699, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 52830615.0, + "rewards/KL_reward/mean": -0.10021911561489105, + "rewards/KL_reward/std": 0.05514080449938774, + "rewards/accuracy_reward/mean": 0.140625, + "rewards/accuracy_reward/std": 0.3490002751350403, + "rewards/angle_reward/mean": -0.06877849251031876, + "rewards/angle_reward/std": 0.6966662406921387, + "rewards/thinking_verbosity_reward/mean": -0.8576784133911133, + "rewards/thinking_verbosity_reward/std": 0.19515058398246765, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 298.53125, + "epoch": 0.518140589569161, + "grad_norm": 0.03613923117518425, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52900531.0, + "rewards/KL_reward/mean": -0.09775243699550629, + "rewards/KL_reward/std": 0.04872914031147957, + "rewards/accuracy_reward/mean": 0.1640625, + "rewards/accuracy_reward/std": 0.371787428855896, + "rewards/angle_reward/mean": 0.009356520138680935, + "rewards/angle_reward/std": 0.7003117799758911, + "rewards/thinking_verbosity_reward/mean": -0.8445853590965271, + "rewards/thinking_verbosity_reward/std": 0.1752832531929016, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 317.5703125, + "epoch": 0.5192743764172335, + "grad_norm": 0.036219146102666855, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 52973124.0, + "rewards/KL_reward/mean": -0.1005428358912468, + "rewards/KL_reward/std": 0.0575571209192276, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.03271003067493439, + "rewards/angle_reward/std": 0.6863457560539246, + "rewards/thinking_verbosity_reward/mean": -0.8697078227996826, + "rewards/thinking_verbosity_reward/std": 0.18790937960147858, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 308.2890625, + "epoch": 0.5204081632653061, + "grad_norm": 0.034134335815906525, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53044521.0, + "rewards/KL_reward/mean": -0.1032245084643364, + "rewards/KL_reward/std": 0.057847797870635986, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.043558768928050995, + "rewards/angle_reward/std": 0.6570540070533752, + "rewards/thinking_verbosity_reward/mean": -0.8615978956222534, + "rewards/thinking_verbosity_reward/std": 0.16141784191131592, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 346.296875, + "epoch": 0.5215419501133787, + "grad_norm": 0.03669775277376175, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53120535.0, + "rewards/KL_reward/mean": -0.10197243839502335, + "rewards/KL_reward/std": 0.06329569220542908, + "rewards/accuracy_reward/mean": 0.1484375, + "rewards/accuracy_reward/std": 0.356930136680603, + "rewards/angle_reward/mean": -0.01836412027478218, + "rewards/angle_reward/std": 0.7148515582084656, + "rewards/thinking_verbosity_reward/mean": -0.8993373513221741, + "rewards/thinking_verbosity_reward/std": 0.23422886431217194, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 255.59375, + "epoch": 0.5226757369614512, + "grad_norm": 0.04212069511413574, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53184771.0, + "rewards/KL_reward/mean": -0.13540199398994446, + "rewards/KL_reward/std": 0.06138134002685547, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.1746762990951538, + "rewards/angle_reward/mean": -0.05444896221160889, + "rewards/angle_reward/std": 0.7013072967529297, + "rewards/thinking_verbosity_reward/mean": -0.7881078124046326, + "rewards/thinking_verbosity_reward/std": 0.12446502596139908, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 282.8515625, + "epoch": 0.5238095238095238, + "grad_norm": 0.04676346480846405, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53253496.0, + "rewards/KL_reward/mean": -0.12668287754058838, + "rewards/KL_reward/std": 0.0727287158370018, + "rewards/accuracy_reward/mean": 0.09375, + "rewards/accuracy_reward/std": 0.29262590408325195, + "rewards/angle_reward/mean": -0.09203147888183594, + "rewards/angle_reward/std": 0.6941730380058289, + "rewards/thinking_verbosity_reward/mean": -0.8155484199523926, + "rewards/thinking_verbosity_reward/std": 0.19955144822597504, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 267.4765625, + "epoch": 0.5249433106575964, + "grad_norm": 0.05062921345233917, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53319877.0, + "rewards/KL_reward/mean": -0.1394931823015213, + "rewards/KL_reward/std": 0.05866050720214844, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15188287198543549, + "rewards/angle_reward/mean": -0.004266105592250824, + "rewards/angle_reward/std": 0.7063484191894531, + "rewards/thinking_verbosity_reward/mean": -0.8024481534957886, + "rewards/thinking_verbosity_reward/std": 0.14974668622016907, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 270.5390625, + "epoch": 0.5260770975056689, + "grad_norm": 0.03620358929038048, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53386546.0, + "rewards/KL_reward/mean": -0.1303442418575287, + "rewards/KL_reward/std": 0.06146172061562538, + "rewards/accuracy_reward/mean": 0.125, + "rewards/accuracy_reward/std": 0.3320184051990509, + "rewards/angle_reward/mean": -0.04130447655916214, + "rewards/angle_reward/std": 0.6710583567619324, + "rewards/thinking_verbosity_reward/mean": -0.8068867921829224, + "rewards/thinking_verbosity_reward/std": 0.15152832865715027, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 278.0625, + "epoch": 0.5272108843537415, + "grad_norm": 0.04413303732872009, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53454690.0, + "rewards/KL_reward/mean": -0.15620750188827515, + "rewards/KL_reward/std": 0.07933449745178223, + "rewards/accuracy_reward/mean": 0.1875, + "rewards/accuracy_reward/std": 0.39184603095054626, + "rewards/angle_reward/mean": 0.01784469187259674, + "rewards/angle_reward/std": 0.6926581859588623, + "rewards/thinking_verbosity_reward/mean": -0.8130091428756714, + "rewards/thinking_verbosity_reward/std": 0.17866672575473785, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 231.09375, + "epoch": 0.528344671201814, + "grad_norm": 0.05709443241357803, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53516454.0, + "rewards/KL_reward/mean": -0.14998161792755127, + "rewards/KL_reward/std": 0.086819127202034, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": -0.05606694519519806, + "rewards/angle_reward/std": 0.6755439043045044, + "rewards/thinking_verbosity_reward/mean": -0.7366902232170105, + "rewards/thinking_verbosity_reward/std": 0.18104685842990875, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 264.4453125, + "epoch": 0.5294784580498866, + "grad_norm": 0.06272619962692261, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53581967.0, + "rewards/KL_reward/mean": -0.14781680703163147, + "rewards/KL_reward/std": 0.08237636089324951, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.061271876096725464, + "rewards/angle_reward/std": 0.6824977993965149, + "rewards/thinking_verbosity_reward/mean": -0.79157555103302, + "rewards/thinking_verbosity_reward/std": 0.17959390580654144, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 274.828125, + "epoch": 0.5306122448979592, + "grad_norm": 0.07420215010643005, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53649417.0, + "rewards/KL_reward/mean": -0.16903798282146454, + "rewards/KL_reward/std": 0.08875898271799088, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": -0.021459292620420456, + "rewards/angle_reward/std": 0.7030213475227356, + "rewards/thinking_verbosity_reward/mean": -0.8034688234329224, + "rewards/thinking_verbosity_reward/std": 0.19828097522258759, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 237.3828125, + "epoch": 0.5317460317460317, + "grad_norm": 0.04708437994122505, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53711346.0, + "rewards/KL_reward/mean": -0.17129938304424286, + "rewards/KL_reward/std": 0.09669040143489838, + "rewards/accuracy_reward/mean": 0.2265625, + "rewards/accuracy_reward/std": 0.4202519655227661, + "rewards/angle_reward/mean": -0.0005590301007032394, + "rewards/angle_reward/std": 0.702396035194397, + "rewards/thinking_verbosity_reward/mean": -0.7507597208023071, + "rewards/thinking_verbosity_reward/std": 0.1658087521791458, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 206.328125, + "epoch": 0.5328798185941043, + "grad_norm": 0.07587230950593948, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53770020.0, + "rewards/KL_reward/mean": -0.21926406025886536, + "rewards/KL_reward/std": 0.10634764283895493, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.0663021057844162, + "rewards/angle_reward/std": 0.7422232031822205, + "rewards/thinking_verbosity_reward/mean": -0.7035611867904663, + "rewards/thinking_verbosity_reward/std": 0.1358904093503952, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 214.9140625, + "epoch": 0.5340136054421769, + "grad_norm": 0.07078830152750015, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53829705.0, + "rewards/KL_reward/mean": -0.24120840430259705, + "rewards/KL_reward/std": 0.17651721835136414, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": -0.03783184662461281, + "rewards/angle_reward/std": 0.6946678757667542, + "rewards/thinking_verbosity_reward/mean": -0.7069072723388672, + "rewards/thinking_verbosity_reward/std": 0.18784119188785553, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 216.9140625, + "epoch": 0.5351473922902494, + "grad_norm": 0.09605925530195236, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53889510.0, + "rewards/KL_reward/mean": -0.25455862283706665, + "rewards/KL_reward/std": 0.2185129076242447, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.194504976272583, + "rewards/angle_reward/mean": -0.03943023830652237, + "rewards/angle_reward/std": 0.7367250323295593, + "rewards/thinking_verbosity_reward/mean": -0.7035454511642456, + "rewards/thinking_verbosity_reward/std": 0.21246668696403503, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 171.3125, + "epoch": 0.536281179138322, + "grad_norm": 0.09049979597330093, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 53943630.0, + "rewards/KL_reward/mean": -0.277265727519989, + "rewards/KL_reward/std": 0.13049979507923126, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": -0.06564237922430038, + "rewards/angle_reward/std": 0.7101609110832214, + "rewards/thinking_verbosity_reward/mean": -0.6378944516181946, + "rewards/thinking_verbosity_reward/std": 0.13784360885620117, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 148.8125, + "epoch": 0.5374149659863946, + "grad_norm": 0.11495912820100784, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 53994934.0, + "rewards/KL_reward/mean": -0.34224867820739746, + "rewards/KL_reward/std": 0.19699767231941223, + "rewards/accuracy_reward/mean": 0.0390625, + "rewards/accuracy_reward/std": 0.194504976272583, + "rewards/angle_reward/mean": -0.008879505097866058, + "rewards/angle_reward/std": 0.6997842788696289, + "rewards/thinking_verbosity_reward/mean": -0.5875452160835266, + "rewards/thinking_verbosity_reward/std": 0.15656743943691254, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 129.625, + "epoch": 0.5385487528344671, + "grad_norm": 0.1606689691543579, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54043854.0, + "rewards/KL_reward/mean": -0.4127693474292755, + "rewards/KL_reward/std": 0.2230147123336792, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.1325501799583435, + "rewards/angle_reward/std": 0.7190689444541931, + "rewards/thinking_verbosity_reward/mean": -0.547183632850647, + "rewards/thinking_verbosity_reward/std": 0.14929042756557465, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 140.9296875, + "epoch": 0.5396825396825397, + "grad_norm": 0.13437631726264954, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54094013.0, + "rewards/KL_reward/mean": -0.4838368892669678, + "rewards/KL_reward/std": 0.4108221232891083, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.0751531645655632, + "rewards/angle_reward/std": 0.7036384344100952, + "rewards/thinking_verbosity_reward/mean": -0.5652834177017212, + "rewards/thinking_verbosity_reward/std": 0.17469199001789093, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 114.9453125, + "epoch": 0.5408163265306123, + "grad_norm": 0.22570781409740448, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54140718.0, + "rewards/KL_reward/mean": -0.5766152143478394, + "rewards/KL_reward/std": 0.5207230448722839, + "rewards/accuracy_reward/mean": 0.0546875, + "rewards/accuracy_reward/std": 0.22826264798641205, + "rewards/angle_reward/mean": -0.05803005397319794, + "rewards/angle_reward/std": 0.7059148550033569, + "rewards/thinking_verbosity_reward/mean": -0.5080236196517944, + "rewards/thinking_verbosity_reward/std": 0.16427448391914368, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 90.3671875, + "epoch": 0.5419501133786848, + "grad_norm": 0.5434188842773438, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54184253.0, + "rewards/KL_reward/mean": -0.7356384992599487, + "rewards/KL_reward/std": 0.6622855067253113, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": -0.10792765021324158, + "rewards/angle_reward/std": 0.7056306004524231, + "rewards/thinking_verbosity_reward/mean": -0.4460752606391907, + "rewards/thinking_verbosity_reward/std": 0.15693074464797974, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 83.7890625, + "epoch": 0.5430839002267573, + "grad_norm": 0.6073722243309021, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54227074.0, + "rewards/KL_reward/mean": -0.9975966215133667, + "rewards/KL_reward/std": 1.713445782661438, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.06350193172693253, + "rewards/angle_reward/std": 0.7050646543502808, + "rewards/thinking_verbosity_reward/mean": -0.4233494699001312, + "rewards/thinking_verbosity_reward/std": 0.167231485247612, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 37.3046875, + "epoch": 0.54421768707483, + "grad_norm": 1.9195548295974731, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54264161.0, + "rewards/KL_reward/mean": -1.834166169166565, + "rewards/KL_reward/std": 1.3385273218154907, + "rewards/accuracy_reward/mean": 0.078125, + "rewards/accuracy_reward/std": 0.2694226801395416, + "rewards/angle_reward/mean": 0.05574992671608925, + "rewards/angle_reward/std": 0.711519181728363, + "rewards/thinking_verbosity_reward/mean": -0.2586079239845276, + "rewards/thinking_verbosity_reward/std": 0.15521405637264252, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 22.3984375, + "epoch": 0.5453514739229025, + "grad_norm": 0.7810635566711426, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54299196.0, + "rewards/KL_reward/mean": -2.171604871749878, + "rewards/KL_reward/std": 1.0320184230804443, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12450689822435379, + "rewards/angle_reward/mean": 0.13008692860603333, + "rewards/angle_reward/std": 0.6445255875587463, + "rewards/thinking_verbosity_reward/mean": -0.1941802203655243, + "rewards/thinking_verbosity_reward/std": 0.12615256011486053, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 8.65625, + "epoch": 0.546485260770975, + "grad_norm": 1.0425291061401367, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54331920.0, + "rewards/KL_reward/mean": -2.89699125289917, + "rewards/KL_reward/std": 1.0191465616226196, + "rewards/accuracy_reward/mean": 0.1328125, + "rewards/accuracy_reward/std": 0.3407054841518402, + "rewards/angle_reward/mean": 0.11439374834299088, + "rewards/angle_reward/std": 0.6321514844894409, + "rewards/thinking_verbosity_reward/mean": -0.13150997459888458, + "rewards/thinking_verbosity_reward/std": 0.04313100129365921, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 8.6328125, + "epoch": 0.5476190476190477, + "grad_norm": 0.6215812563896179, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54365409.0, + "rewards/KL_reward/mean": -3.594916582107544, + "rewards/KL_reward/std": 1.5705430507659912, + "rewards/accuracy_reward/mean": 0.03125, + "rewards/accuracy_reward/std": 0.1746762990951538, + "rewards/angle_reward/mean": -0.04716882482171059, + "rewards/angle_reward/std": 0.5511569380760193, + "rewards/thinking_verbosity_reward/mean": -0.12736451625823975, + "rewards/thinking_verbosity_reward/std": 0.05369199439883232, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.5703125, + "epoch": 0.5487528344671202, + "grad_norm": 0.5058205127716064, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54398402.0, + "rewards/KL_reward/mean": -3.5983033180236816, + "rewards/KL_reward/std": 0.9799101948738098, + "rewards/accuracy_reward/mean": 0.046875, + "rewards/accuracy_reward/std": 0.21220162510871887, + "rewards/angle_reward/mean": -0.014187419787049294, + "rewards/angle_reward/std": 0.6621562242507935, + "rewards/thinking_verbosity_reward/mean": -0.1267462968826294, + "rewards/thinking_verbosity_reward/std": 0.019078785553574562, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.640625, + "epoch": 0.5498866213151927, + "grad_norm": 0.5460374355316162, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54431804.0, + "rewards/KL_reward/mean": -3.4369935989379883, + "rewards/KL_reward/std": 1.0887336730957031, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": 0.017852269113063812, + "rewards/angle_reward/std": 0.6593750715255737, + "rewards/thinking_verbosity_reward/mean": -0.1271992027759552, + "rewards/thinking_verbosity_reward/std": 0.020621543750166893, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.5390625, + "epoch": 0.5510204081632653, + "grad_norm": 0.5307416915893555, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54464305.0, + "rewards/KL_reward/mean": -3.6182875633239746, + "rewards/KL_reward/std": 1.1485244035720825, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.0883883461356163, + "rewards/angle_reward/mean": 0.022097047418355942, + "rewards/angle_reward/std": 0.569690465927124, + "rewards/thinking_verbosity_reward/mean": -0.12511225044727325, + "rewards/thinking_verbosity_reward/std": 0.026458468288183212, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.4140625, + "epoch": 0.5521541950113379, + "grad_norm": 0.857745349407196, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54497494.0, + "rewards/KL_reward/mean": -3.4195375442504883, + "rewards/KL_reward/std": 0.8222663998603821, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12450689822435379, + "rewards/angle_reward/mean": -0.07733979821205139, + "rewards/angle_reward/std": 0.6351537108421326, + "rewards/thinking_verbosity_reward/mean": -0.12590113282203674, + "rewards/thinking_verbosity_reward/std": 0.014324675314128399, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.125, + "epoch": 0.5532879818594104, + "grad_norm": 0.7705725431442261, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54530838.0, + "rewards/KL_reward/mean": -3.646390438079834, + "rewards/KL_reward/std": 1.0909744501113892, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.012675169855356216, + "rewards/angle_reward/std": 0.7345511317253113, + "rewards/thinking_verbosity_reward/mean": -0.12285362184047699, + "rewards/thinking_verbosity_reward/std": 0.01487329788506031, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 6.453125, + "epoch": 0.5544217687074829, + "grad_norm": 1.0522595643997192, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54562448.0, + "rewards/KL_reward/mean": -3.8998897075653076, + "rewards/KL_reward/std": 1.3035476207733154, + "rewards/accuracy_reward/mean": 0.1171875, + "rewards/accuracy_reward/std": 0.322907418012619, + "rewards/angle_reward/mean": -0.05524272099137306, + "rewards/angle_reward/std": 0.5859870910644531, + "rewards/thinking_verbosity_reward/mean": -0.11607244610786438, + "rewards/thinking_verbosity_reward/std": 0.012698753736913204, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.515625, + "epoch": 0.5555555555555556, + "grad_norm": 0.7135828137397766, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54594826.0, + "rewards/KL_reward/mean": -3.6429526805877686, + "rewards/KL_reward/std": 2.472461462020874, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12450689822435379, + "rewards/angle_reward/mean": 0.021636418998241425, + "rewards/angle_reward/std": 0.6236943602561951, + "rewards/thinking_verbosity_reward/mean": -0.1257990598678589, + "rewards/thinking_verbosity_reward/std": 0.021617397665977478, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 6.9453125, + "epoch": 0.5566893424036281, + "grad_norm": 0.8383898735046387, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54628283.0, + "rewards/KL_reward/mean": -3.845817804336548, + "rewards/KL_reward/std": 0.9958040118217468, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12450689822435379, + "rewards/angle_reward/mean": -0.1233597844839096, + "rewards/angle_reward/std": 0.6113914251327515, + "rewards/thinking_verbosity_reward/mean": -0.12134502083063126, + "rewards/thinking_verbosity_reward/std": 0.011821961961686611, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 7.03125, + "epoch": 0.5578231292517006, + "grad_norm": 1.1921794414520264, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 54661407.0, + "rewards/KL_reward/mean": -3.6719002723693848, + "rewards/KL_reward/std": 1.2782294750213623, + "rewards/accuracy_reward/mean": 0.0703125, + "rewards/accuracy_reward/std": 0.2566775679588318, + "rewards/angle_reward/mean": -0.06361042708158493, + "rewards/angle_reward/std": 0.6366872191429138, + "rewards/thinking_verbosity_reward/mean": -0.1219245195388794, + "rewards/thinking_verbosity_reward/std": 0.0146359046921134, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 6.4375, + "epoch": 0.5589569160997733, + "grad_norm": 1.3316749334335327, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54694175.0, + "rewards/KL_reward/mean": -4.398461818695068, + "rewards/KL_reward/std": 1.5992577075958252, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.007545974105596542, + "rewards/angle_reward/std": 0.6428928971290588, + "rewards/thinking_verbosity_reward/mean": -0.11585833877325058, + "rewards/thinking_verbosity_reward/std": 0.013112529180943966, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 5.6796875, + "epoch": 0.5600907029478458, + "grad_norm": 2.5037853717803955, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54726870.0, + "rewards/KL_reward/mean": -5.423948764801025, + "rewards/KL_reward/std": 2.345402240753174, + "rewards/accuracy_reward/mean": 0.0625, + "rewards/accuracy_reward/std": 0.24301259219646454, + "rewards/angle_reward/mean": -0.11048542708158493, + "rewards/angle_reward/std": 0.6301949620246887, + "rewards/thinking_verbosity_reward/mean": -0.10674476623535156, + "rewards/thinking_verbosity_reward/std": 0.017526322975754738, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 5.3359375, + "epoch": 0.5612244897959183, + "grad_norm": 2.2523908615112305, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 54759745.0, + "rewards/KL_reward/mean": -6.66871452331543, + "rewards/KL_reward/std": 2.817564010620117, + "rewards/accuracy_reward/mean": 0.0859375, + "rewards/accuracy_reward/std": 0.2813730239868164, + "rewards/angle_reward/mean": 0.011048581451177597, + "rewards/angle_reward/std": 0.6146785616874695, + "rewards/thinking_verbosity_reward/mean": -0.10279585421085358, + "rewards/thinking_verbosity_reward/std": 0.016583282500505447, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.1796875, + "epoch": 0.562358276643991, + "grad_norm": 0.9493520855903625, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 54791880.0, + "rewards/KL_reward/mean": -8.693005561828613, + "rewards/KL_reward/std": 1.4315944910049438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.3486475646495819, + "rewards/thinking_verbosity_reward/mean": -0.08882145583629608, + "rewards/thinking_verbosity_reward/std": 0.00777425104752183, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0859375, + "epoch": 0.5634920634920635, + "grad_norm": 0.9398501515388489, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_tokens": 54824803.0, + "rewards/KL_reward/mean": -8.986701011657715, + "rewards/KL_reward/std": 1.7665596008300781, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.07733979821205139, + "rewards/angle_reward/std": 0.3463462293148041, + "rewards/thinking_verbosity_reward/mean": -0.08729038387537003, + "rewards/thinking_verbosity_reward/std": 0.009797018021345139, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.564625850340136, + "grad_norm": 0.1411595642566681, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 54857107.0, + "rewards/KL_reward/mean": -9.378817558288574, + "rewards/KL_reward/std": 1.6368354558944702, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5657596371882087, + "grad_norm": 0.12384044378995895, + "learning_rate": 5e-05, + "loss": -0.0022, + "num_tokens": 54890211.0, + "rewards/KL_reward/mean": -10.19272232055664, + "rewards/KL_reward/std": 1.4332489967346191, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0390625, + "epoch": 0.5668934240362812, + "grad_norm": 0.21007029712200165, + "learning_rate": 5e-05, + "loss": 0.0034, + "num_tokens": 54922392.0, + "rewards/KL_reward/mean": -9.341675758361816, + "rewards/KL_reward/std": 1.3299744129180908, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.08688278496265411, + "rewards/thinking_verbosity_reward/std": 0.003170661861076951, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5680272108843537, + "grad_norm": 0.08080130070447922, + "learning_rate": 5e-05, + "loss": -0.0014, + "num_tokens": 54954752.0, + "rewards/KL_reward/mean": -9.528559684753418, + "rewards/KL_reward/std": 1.6406162977218628, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5691609977324263, + "grad_norm": 0.14609335362911224, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 54987352.0, + "rewards/KL_reward/mean": -9.704826354980469, + "rewards/KL_reward/std": 1.48715078830719, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0078125, + "epoch": 0.5702947845804989, + "grad_norm": 0.20738476514816284, + "learning_rate": 5e-05, + "loss": -0.0043, + "num_tokens": 55020329.0, + "rewards/KL_reward/mean": -9.367766380310059, + "rewards/KL_reward/std": 0.917743444442749, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.086707204580307, + "rewards/thinking_verbosity_reward/std": 0.0011841795640066266, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5714285714285714, + "grad_norm": 0.0858779102563858, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 55052305.0, + "rewards/KL_reward/mean": -9.519469261169434, + "rewards/KL_reward/std": 1.7006648778915405, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.9921875, + "epoch": 0.572562358276644, + "grad_norm": 0.564150333404541, + "learning_rate": 5e-05, + "loss": 0.0076, + "num_tokens": 55084392.0, + "rewards/KL_reward/mean": -9.431741714477539, + "rewards/KL_reward/std": 1.0106531381607056, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.08647838234901428, + "rewards/thinking_verbosity_reward/std": 0.0014046551659703255, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5736961451247166, + "grad_norm": 0.06539922207593918, + "learning_rate": 5e-05, + "loss": -0.0044, + "num_tokens": 55117504.0, + "rewards/KL_reward/mean": -9.063766479492188, + "rewards/KL_reward/std": 1.2047866582870483, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5748299319727891, + "grad_norm": 0.15550510585308075, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_tokens": 55150368.0, + "rewards/KL_reward/mean": -9.597511291503906, + "rewards/KL_reward/std": 1.5458221435546875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5759637188208617, + "grad_norm": 0.10294591635465622, + "learning_rate": 5e-05, + "loss": -0.0018, + "num_tokens": 55182768.0, + "rewards/KL_reward/mean": -9.482969284057617, + "rewards/KL_reward/std": 1.1649845838546753, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5770975056689343, + "grad_norm": 0.07005158066749573, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 55215344.0, + "rewards/KL_reward/mean": -9.859160423278809, + "rewards/KL_reward/std": 1.0055972337722778, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5782312925170068, + "grad_norm": 0.11765587329864502, + "learning_rate": 5e-05, + "loss": -0.003, + "num_tokens": 55248264.0, + "rewards/KL_reward/mean": -9.879338264465332, + "rewards/KL_reward/std": 1.404909372329712, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5793650793650794, + "grad_norm": 0.11884509772062302, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_tokens": 55281080.0, + "rewards/KL_reward/mean": -9.526758193969727, + "rewards/KL_reward/std": 1.5549665689468384, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0234375, + "epoch": 0.5804988662131519, + "grad_norm": 0.2688693106174469, + "learning_rate": 5e-05, + "loss": -0.001, + "num_tokens": 55313387.0, + "rewards/KL_reward/mean": -9.45893383026123, + "rewards/KL_reward/std": 1.2606397867202759, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.022097086533904076, + "rewards/angle_reward/std": 0.17607934772968292, + "rewards/thinking_verbosity_reward/mean": -0.0868721455335617, + "rewards/thinking_verbosity_reward/std": 0.0034520491026341915, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5816326530612245, + "grad_norm": 0.06777095049619675, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_tokens": 55345827.0, + "rewards/KL_reward/mean": -9.774582862854004, + "rewards/KL_reward/std": 1.4200104475021362, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5827664399092971, + "grad_norm": 0.08141104876995087, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 55378259.0, + "rewards/KL_reward/mean": -9.087238311767578, + "rewards/KL_reward/std": 1.020469069480896, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.015625, + "epoch": 0.5839002267573696, + "grad_norm": 0.3209582567214966, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_tokens": 55410893.0, + "rewards/KL_reward/mean": -9.666126251220703, + "rewards/KL_reward/std": 1.3984832763671875, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.08679942041635513, + "rewards/thinking_verbosity_reward/std": 0.002227462362498045, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5850340136054422, + "grad_norm": 0.11173339933156967, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 55443661.0, + "rewards/KL_reward/mean": -9.571331024169922, + "rewards/KL_reward/std": 0.9361988306045532, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5861678004535147, + "grad_norm": 0.15266042947769165, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 55476573.0, + "rewards/KL_reward/mean": -9.854283332824707, + "rewards/KL_reward/std": 1.109019160270691, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5873015873015873, + "grad_norm": 0.2016039490699768, + "learning_rate": 5e-05, + "loss": -0.0027, + "num_tokens": 55509413.0, + "rewards/KL_reward/mean": -9.629522323608398, + "rewards/KL_reward/std": 1.0815303325653076, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5884353741496599, + "grad_norm": 0.07484356313943863, + "learning_rate": 5e-05, + "loss": -0.0012, + "num_tokens": 55541789.0, + "rewards/KL_reward/mean": -8.9932861328125, + "rewards/KL_reward/std": 0.9899211525917053, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.9921875, + "epoch": 0.5895691609977324, + "grad_norm": 0.37136051058769226, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 55574068.0, + "rewards/KL_reward/mean": -9.123318672180176, + "rewards/KL_reward/std": 1.024639368057251, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.08647838234901428, + "rewards/thinking_verbosity_reward/std": 0.0014046551659703255, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.590702947845805, + "grad_norm": 0.0857769325375557, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_tokens": 55606356.0, + "rewards/KL_reward/mean": -9.66751766204834, + "rewards/KL_reward/std": 1.013816237449646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5918367346938775, + "grad_norm": 0.0768849328160286, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_tokens": 55638556.0, + "rewards/KL_reward/mean": -9.529890060424805, + "rewards/KL_reward/std": 0.9348580241203308, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5929705215419501, + "grad_norm": 0.16719675064086914, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 55671244.0, + "rewards/KL_reward/mean": -9.763287544250488, + "rewards/KL_reward/std": 1.6031088829040527, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.9921875, + "epoch": 0.5941043083900227, + "grad_norm": 0.39724478125572205, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 55702843.0, + "rewards/KL_reward/mean": -9.557028770446777, + "rewards/KL_reward/std": 1.890602946281433, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.08647838234901428, + "rewards/thinking_verbosity_reward/std": 0.0014046551659703255, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5952380952380952, + "grad_norm": 0.1424868106842041, + "learning_rate": 5e-05, + "loss": -0.001, + "num_tokens": 55735427.0, + "rewards/KL_reward/mean": -9.618017196655273, + "rewards/KL_reward/std": 1.1586666107177734, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5963718820861678, + "grad_norm": 0.1281495839357376, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 55767619.0, + "rewards/KL_reward/mean": -9.235986709594727, + "rewards/KL_reward/std": 1.3936333656311035, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.9765625, + "epoch": 0.5975056689342404, + "grad_norm": 0.827623188495636, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55799872.0, + "rewards/KL_reward/mean": -9.611066818237305, + "rewards/KL_reward/std": 1.5234673023223877, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.08619242906570435, + "rewards/thinking_verbosity_reward/std": 0.003516852855682373, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0234375, + "epoch": 0.5986394557823129, + "grad_norm": 0.4614194631576538, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 55832363.0, + "rewards/KL_reward/mean": -9.693451881408691, + "rewards/KL_reward/std": 1.0000463724136353, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.08688278496265411, + "rewards/thinking_verbosity_reward/std": 0.003170661861076951, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.5997732426303855, + "grad_norm": 0.08846230804920197, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55864427.0, + "rewards/KL_reward/mean": -9.651050567626953, + "rewards/KL_reward/std": 1.1203112602233887, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.08660253882408142, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.984375, + "epoch": 0.6009070294784581, + "grad_norm": 0.8824671506881714, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 55897281.0, + "rewards/KL_reward/mean": -9.703022956848145, + "rewards/KL_reward/std": 1.6452223062515259, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.08635422587394714, + "rewards/thinking_verbosity_reward/std": 0.001978646032512188, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.9609375, + "epoch": 0.6020408163265306, + "grad_norm": 0.7833751440048218, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 55929860.0, + "rewards/KL_reward/mean": -10.333253860473633, + "rewards/KL_reward/std": 1.588026762008667, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.13258251547813416, + "rewards/angle_reward/std": 0.3290405869483948, + "rewards/thinking_verbosity_reward/mean": -0.08594411611557007, + "rewards/thinking_verbosity_reward/std": 0.0040097408927977085, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.9921875, + "epoch": 0.6031746031746031, + "grad_norm": 1.585099220275879, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 55962315.0, + "rewards/KL_reward/mean": -9.991573333740234, + "rewards/KL_reward/std": 1.682023048400879, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.46836432814598083, + "rewards/thinking_verbosity_reward/mean": -0.08640043437480927, + "rewards/thinking_verbosity_reward/std": 0.003943993244320154, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.828125, + "epoch": 0.6043083900226758, + "grad_norm": 2.4811768531799316, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 55995021.0, + "rewards/KL_reward/mean": -9.870138168334961, + "rewards/KL_reward/std": 2.0301644802093506, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.3425048589706421, + "rewards/angle_reward/std": 0.5096268057823181, + "rewards/thinking_verbosity_reward/mean": -0.08377633988857269, + "rewards/thinking_verbosity_reward/std": 0.00722804618999362, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 29.84375, + "epoch": 0.6054421768707483, + "grad_norm": 2.358689546585083, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56031185.0, + "rewards/KL_reward/mean": -11.116844177246094, + "rewards/KL_reward/std": 4.051690578460693, + "rewards/accuracy_reward/mean": 0.015625, + "rewards/accuracy_reward/std": 0.12450689822435379, + "rewards/angle_reward/mean": -0.005799375474452972, + "rewards/angle_reward/std": 0.6632209420204163, + "rewards/thinking_verbosity_reward/mean": -0.10786447674036026, + "rewards/thinking_verbosity_reward/std": 0.24688217043876648, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.40625, + "epoch": 0.6065759637188208, + "grad_norm": 3.3132803440093994, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56063637.0, + "rewards/KL_reward/mean": -12.70104694366455, + "rewards/KL_reward/std": 3.0239720344543457, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.0883883461356163, + "rewards/angle_reward/mean": 0.03127017617225647, + "rewards/angle_reward/std": 0.7112759947776794, + "rewards/thinking_verbosity_reward/mean": -0.07684475183486938, + "rewards/thinking_verbosity_reward/std": 0.01144831720739603, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.59375, + "epoch": 0.6077097505668935, + "grad_norm": 1.9887434244155884, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56095833.0, + "rewards/KL_reward/mean": -13.586993217468262, + "rewards/KL_reward/std": 3.1474719047546387, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.14363107085227966, + "rewards/angle_reward/std": 0.5976290106773376, + "rewards/thinking_verbosity_reward/mean": -0.07788439095020294, + "rewards/thinking_verbosity_reward/std": 0.020535118877887726, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.1328125, + "epoch": 0.608843537414966, + "grad_norm": 1.1461775302886963, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56127986.0, + "rewards/KL_reward/mean": -15.27037239074707, + "rewards/KL_reward/std": 2.396375894546509, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.22097085416316986, + "rewards/angle_reward/std": 0.4138355255126953, + "rewards/thinking_verbosity_reward/mean": -0.07201752811670303, + "rewards/thinking_verbosity_reward/std": 0.012897231616079807, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.0234375, + "epoch": 0.6099773242630385, + "grad_norm": 2.3434665203094482, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56160565.0, + "rewards/KL_reward/mean": -14.925134658813477, + "rewards/KL_reward/std": 1.6955004930496216, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.0883883461356163, + "rewards/angle_reward/mean": 0.006519727408885956, + "rewards/angle_reward/std": 0.4306424856185913, + "rewards/thinking_verbosity_reward/mean": -0.0707487016916275, + "rewards/thinking_verbosity_reward/std": 0.00732355285435915, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 26.5390625, + "epoch": 0.6111111111111112, + "grad_norm": 4.019361972808838, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56195946.0, + "rewards/KL_reward/mean": -14.851444244384766, + "rewards/KL_reward/std": 2.7126476764678955, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.022097084671258926, + "rewards/angle_reward/std": 0.39621734619140625, + "rewards/thinking_verbosity_reward/mean": -0.08837562799453735, + "rewards/thinking_verbosity_reward/std": 0.23769377171993256, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 8.140625, + "epoch": 0.6122448979591837, + "grad_norm": 6.1977362632751465, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56228708.0, + "rewards/KL_reward/mean": -16.46318817138672, + "rewards/KL_reward/std": 3.266050100326538, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0883883386850357, + "rewards/angle_reward/std": 0.5541539788246155, + "rewards/thinking_verbosity_reward/mean": -0.0723920613527298, + "rewards/thinking_verbosity_reward/std": 0.11273974925279617, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 51.421875, + "epoch": 0.6133786848072562, + "grad_norm": 6.258033752441406, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56266698.0, + "rewards/KL_reward/mean": -14.750425338745117, + "rewards/KL_reward/std": 4.972354888916016, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.09943688660860062, + "rewards/angle_reward/std": 0.6066194772720337, + "rewards/thinking_verbosity_reward/mean": -0.11021846532821655, + "rewards/thinking_verbosity_reward/std": 0.33888471126556396, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.9609375, + "epoch": 0.6145124716553289, + "grad_norm": 2.545353889465332, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56299501.0, + "rewards/KL_reward/mean": -13.780776023864746, + "rewards/KL_reward/std": 4.725129127502441, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.15467959642410278, + "rewards/angle_reward/std": 0.5092645287513733, + "rewards/thinking_verbosity_reward/mean": -0.06911271065473557, + "rewards/thinking_verbosity_reward/std": 0.07187584787607193, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 45.65625, + "epoch": 0.6156462585034014, + "grad_norm": 8.453726768493652, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56337401.0, + "rewards/KL_reward/mean": -15.310544967651367, + "rewards/KL_reward/std": 4.958581447601318, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.055242717266082764, + "rewards/angle_reward/std": 0.5859870910644531, + "rewards/thinking_verbosity_reward/mean": -0.11464644968509674, + "rewards/thinking_verbosity_reward/std": 0.31510642170906067, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 2.7890625, + "epoch": 0.6167800453514739, + "grad_norm": 4.095304489135742, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56370014.0, + "rewards/KL_reward/mean": -19.383827209472656, + "rewards/KL_reward/std": 6.920942783355713, + "rewards/accuracy_reward/mean": 0.0234375, + "rewards/accuracy_reward/std": 0.15188287198543549, + "rewards/angle_reward/mean": 0.09018324315547943, + "rewards/angle_reward/std": 0.4330177307128906, + "rewards/thinking_verbosity_reward/mean": -0.0450855977833271, + "rewards/thinking_verbosity_reward/std": 0.049589890986680984, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.6171875, + "epoch": 0.6179138321995464, + "grad_norm": 18.613027572631836, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 56402205.0, + "rewards/KL_reward/mean": -22.613176345825195, + "rewards/KL_reward/std": 8.32883071899414, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.07733979821205139, + "rewards/angle_reward/std": 0.46308088302612305, + "rewards/thinking_verbosity_reward/mean": -0.024793803691864014, + "rewards/thinking_verbosity_reward/std": 0.03058668039739132, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.2109375, + "epoch": 0.6190476190476191, + "grad_norm": 6.894687652587891, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 56433896.0, + "rewards/KL_reward/mean": -26.91053009033203, + "rewards/KL_reward/std": 7.242550849914551, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.055242717266082764, + "rewards/angle_reward/std": 0.3505830466747284, + "rewards/thinking_verbosity_reward/mean": -0.010546875186264515, + "rewards/thinking_verbosity_reward/std": 0.020478859543800354, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.046875, + "epoch": 0.6201814058956916, + "grad_norm": 5.023636341094971, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 56464654.0, + "rewards/KL_reward/mean": -32.346656799316406, + "rewards/KL_reward/std": 3.809330940246582, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.0013531646691262722, + "rewards/thinking_verbosity_reward/std": 0.010782613418996334, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6213151927437641, + "grad_norm": 0.4634699821472168, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 56496894.0, + "rewards/KL_reward/mean": -33.13923645019531, + "rewards/KL_reward/std": 1.1342289447784424, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6224489795918368, + "grad_norm": 0.7207654714584351, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 56528990.0, + "rewards/KL_reward/mean": -33.207855224609375, + "rewards/KL_reward/std": 1.6682014465332031, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.6235827664399093, + "grad_norm": 1.3963048458099365, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 56560783.0, + "rewards/KL_reward/mean": -33.020751953125, + "rewards/KL_reward/std": 1.8156154155731201, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6247165532879818, + "grad_norm": 0.6619357466697693, + "learning_rate": 5e-05, + "loss": -0.0022, + "num_tokens": 56592831.0, + "rewards/KL_reward/mean": -32.45671463012695, + "rewards/KL_reward/std": 4.836127758026123, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6258503401360545, + "grad_norm": 0.4096638858318329, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_tokens": 56624415.0, + "rewards/KL_reward/mean": -33.64544677734375, + "rewards/KL_reward/std": 0.7837253212928772, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.626984126984127, + "grad_norm": 0.035003989934921265, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 56656871.0, + "rewards/KL_reward/mean": -33.58020782470703, + "rewards/KL_reward/std": 1.1956831216812134, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6281179138321995, + "grad_norm": 0.002811228157952428, + "learning_rate": 5e-05, + "loss": -0.003, + "num_tokens": 56688807.0, + "rewards/KL_reward/mean": -33.92033386230469, + "rewards/KL_reward/std": 0.7668801546096802, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6292517006802721, + "grad_norm": 0.0012724545085802674, + "learning_rate": 5e-05, + "loss": -0.0015, + "num_tokens": 56720983.0, + "rewards/KL_reward/mean": -33.561241149902344, + "rewards/KL_reward/std": 0.7718645334243774, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6303854875283447, + "grad_norm": 0.002310891402885318, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 56753151.0, + "rewards/KL_reward/mean": -33.38077163696289, + "rewards/KL_reward/std": 1.3701101541519165, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6315192743764172, + "grad_norm": 0.23985038697719574, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 56785231.0, + "rewards/KL_reward/mean": -33.302833557128906, + "rewards/KL_reward/std": 1.7017228603363037, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6326530612244898, + "grad_norm": 0.00048049696488305926, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_tokens": 56817119.0, + "rewards/KL_reward/mean": -33.43657684326172, + "rewards/KL_reward/std": 0.8716328144073486, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6337868480725624, + "grad_norm": 0.6322202086448669, + "learning_rate": 5e-05, + "loss": -0.0021, + "num_tokens": 56848839.0, + "rewards/KL_reward/mean": -32.780845642089844, + "rewards/KL_reward/std": 2.6573376655578613, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6349206349206349, + "grad_norm": 0.5810360908508301, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_tokens": 56881199.0, + "rewards/KL_reward/mean": -32.47676086425781, + "rewards/KL_reward/std": 2.3711776733398438, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6360544217687075, + "grad_norm": 0.40197378396987915, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 56913071.0, + "rewards/KL_reward/mean": -32.77272415161133, + "rewards/KL_reward/std": 3.1013097763061523, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.63718820861678, + "grad_norm": 0.0005623187753371894, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_tokens": 56945607.0, + "rewards/KL_reward/mean": -33.655338287353516, + "rewards/KL_reward/std": 1.1464321613311768, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6383219954648526, + "grad_norm": 0.0015330812893807888, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_tokens": 56977527.0, + "rewards/KL_reward/mean": -32.998355865478516, + "rewards/KL_reward/std": 1.5066907405853271, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6394557823129252, + "grad_norm": 0.33337169885635376, + "learning_rate": 5e-05, + "loss": -0.0015, + "num_tokens": 57009295.0, + "rewards/KL_reward/mean": -33.21354675292969, + "rewards/KL_reward/std": 1.2528866529464722, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6405895691609977, + "grad_norm": 0.0013399942545220256, + "learning_rate": 5e-05, + "loss": 0.002, + "num_tokens": 57041639.0, + "rewards/KL_reward/mean": -33.86585998535156, + "rewards/KL_reward/std": 1.0116124153137207, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 565 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6417233560090703, + "grad_norm": 0.05720170587301254, + "learning_rate": 5e-05, + "loss": -0.0012, + "num_tokens": 57074119.0, + "rewards/KL_reward/mean": -32.12410354614258, + "rewards/KL_reward/std": 1.4921815395355225, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 566 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6428571428571429, + "grad_norm": 0.0004908728296868503, + "learning_rate": 5e-05, + "loss": -0.0025, + "num_tokens": 57106527.0, + "rewards/KL_reward/mean": -33.23358154296875, + "rewards/KL_reward/std": 0.9941945672035217, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 567 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6439909297052154, + "grad_norm": 0.0002522620197851211, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 57138207.0, + "rewards/KL_reward/mean": -33.54637145996094, + "rewards/KL_reward/std": 1.3176603317260742, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 568 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.645124716553288, + "grad_norm": 0.0027960508596152067, + "learning_rate": 5e-05, + "loss": -0.0043, + "num_tokens": 57170455.0, + "rewards/KL_reward/mean": -33.99753952026367, + "rewards/KL_reward/std": 1.4594237804412842, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 569 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6462585034013606, + "grad_norm": 0.0008815639885142446, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 57202839.0, + "rewards/KL_reward/mean": -33.43678665161133, + "rewards/KL_reward/std": 1.100521445274353, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 570 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6473922902494331, + "grad_norm": 0.005347798112779856, + "learning_rate": 5e-05, + "loss": 0.001, + "num_tokens": 57235287.0, + "rewards/KL_reward/mean": -32.888587951660156, + "rewards/KL_reward/std": 2.2769651412963867, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 571 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6485260770975056, + "grad_norm": 0.0006010847282595932, + "learning_rate": 5e-05, + "loss": -0.0047, + "num_tokens": 57266391.0, + "rewards/KL_reward/mean": -33.827232360839844, + "rewards/KL_reward/std": 1.2292903661727905, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 572 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6496598639455783, + "grad_norm": 0.0024182789493352175, + "learning_rate": 5e-05, + "loss": 0.0044, + "num_tokens": 57298743.0, + "rewards/KL_reward/mean": -33.92839431762695, + "rewards/KL_reward/std": 1.1869778633117676, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 573 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6507936507936508, + "grad_norm": 0.0004037956241518259, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_tokens": 57330807.0, + "rewards/KL_reward/mean": -33.319618225097656, + "rewards/KL_reward/std": 1.1940268278121948, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 574 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6519274376417233, + "grad_norm": 0.0005184471956454217, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_tokens": 57363559.0, + "rewards/KL_reward/mean": -33.24909973144531, + "rewards/KL_reward/std": 0.9304895401000977, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 575 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6530612244897959, + "grad_norm": 0.00027723200037144125, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 57394887.0, + "rewards/KL_reward/mean": -33.85890197753906, + "rewards/KL_reward/std": 1.0348025560379028, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 576 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6541950113378685, + "grad_norm": 0.01639043353497982, + "learning_rate": 5e-05, + "loss": 0.002, + "num_tokens": 57427231.0, + "rewards/KL_reward/mean": -33.93389892578125, + "rewards/KL_reward/std": 0.9749399423599243, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 577 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.655328798185941, + "grad_norm": 0.1564824879169464, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 57459729.0, + "rewards/KL_reward/mean": -33.030155181884766, + "rewards/KL_reward/std": 1.307621717453003, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 578 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6564625850340136, + "grad_norm": 0.00019246863666921854, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 57491377.0, + "rewards/KL_reward/mean": -33.92146301269531, + "rewards/KL_reward/std": 1.330231785774231, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 579 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6575963718820862, + "grad_norm": 0.23386387526988983, + "learning_rate": 5e-05, + "loss": -0.0012, + "num_tokens": 57523513.0, + "rewards/KL_reward/mean": -33.39048767089844, + "rewards/KL_reward/std": 2.2050530910491943, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 580 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6587301587301587, + "grad_norm": 0.0003712023317348212, + "learning_rate": 5e-05, + "loss": -0.0015, + "num_tokens": 57555297.0, + "rewards/KL_reward/mean": -33.397918701171875, + "rewards/KL_reward/std": 0.8269470930099487, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 581 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6598639455782312, + "grad_norm": 0.0018605771474540234, + "learning_rate": 5e-05, + "loss": 0.0047, + "num_tokens": 57587193.0, + "rewards/KL_reward/mean": -33.85803985595703, + "rewards/KL_reward/std": 1.7370949983596802, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 582 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6609977324263039, + "grad_norm": 0.0011312238639220595, + "learning_rate": 5e-05, + "loss": -0.0051, + "num_tokens": 57619657.0, + "rewards/KL_reward/mean": -33.73335266113281, + "rewards/KL_reward/std": 1.1629188060760498, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 583 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6621315192743764, + "grad_norm": 0.00016034345026127994, + "learning_rate": 5e-05, + "loss": -0.0008, + "num_tokens": 57651625.0, + "rewards/KL_reward/mean": -33.82770919799805, + "rewards/KL_reward/std": 0.7721322178840637, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 584 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6632653061224489, + "grad_norm": 0.00107968517113477, + "learning_rate": 5e-05, + "loss": -0.0011, + "num_tokens": 57684313.0, + "rewards/KL_reward/mean": -33.06117248535156, + "rewards/KL_reward/std": 0.8302332162857056, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 585 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6643990929705216, + "grad_norm": 0.001078012166544795, + "learning_rate": 5e-05, + "loss": 0.0025, + "num_tokens": 57716849.0, + "rewards/KL_reward/mean": -33.73332214355469, + "rewards/KL_reward/std": 0.9741858839988708, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 586 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6655328798185941, + "grad_norm": 0.004448353312909603, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 57749345.0, + "rewards/KL_reward/mean": -33.36539840698242, + "rewards/KL_reward/std": 1.5083750486373901, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 587 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6666666666666666, + "grad_norm": 0.5591500997543335, + "learning_rate": 5e-05, + "loss": -0.007, + "num_tokens": 57781753.0, + "rewards/KL_reward/mean": -32.74077606201172, + "rewards/KL_reward/std": 1.90312659740448, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 588 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6678004535147393, + "grad_norm": 0.00034644355764612556, + "learning_rate": 5e-05, + "loss": -0.0046, + "num_tokens": 57813553.0, + "rewards/KL_reward/mean": -33.96821594238281, + "rewards/KL_reward/std": 0.9595673084259033, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 589 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6689342403628118, + "grad_norm": 0.5171715617179871, + "learning_rate": 5e-05, + "loss": -0.0045, + "num_tokens": 57845481.0, + "rewards/KL_reward/mean": -32.79689407348633, + "rewards/KL_reward/std": 3.483654737472534, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 590 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6700680272108843, + "grad_norm": 0.09763412922620773, + "learning_rate": 5e-05, + "loss": -0.0057, + "num_tokens": 57877929.0, + "rewards/KL_reward/mean": -33.133235931396484, + "rewards/KL_reward/std": 1.5220880508422852, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 591 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.671201814058957, + "grad_norm": 0.0015285967383533716, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 57910425.0, + "rewards/KL_reward/mean": -33.20171356201172, + "rewards/KL_reward/std": 1.681617259979248, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 592 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6723356009070295, + "grad_norm": 0.00030617736047133803, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_tokens": 57942897.0, + "rewards/KL_reward/mean": -33.10102462768555, + "rewards/KL_reward/std": 1.0969640016555786, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 593 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.673469387755102, + "grad_norm": 0.31258565187454224, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 57975081.0, + "rewards/KL_reward/mean": -34.14094543457031, + "rewards/KL_reward/std": 1.3639986515045166, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 594 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6746031746031746, + "grad_norm": 0.0006680377409793437, + "learning_rate": 5e-05, + "loss": 0.002, + "num_tokens": 58007609.0, + "rewards/KL_reward/mean": -33.03845977783203, + "rewards/KL_reward/std": 1.2797842025756836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 595 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6757369614512472, + "grad_norm": 0.0017703570192679763, + "learning_rate": 5e-05, + "loss": -0.0019, + "num_tokens": 58039529.0, + "rewards/KL_reward/mean": -33.55364990234375, + "rewards/KL_reward/std": 0.9732753038406372, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 596 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6768707482993197, + "grad_norm": 0.0001716313709039241, + "learning_rate": 5e-05, + "loss": -0.002, + "num_tokens": 58070553.0, + "rewards/KL_reward/mean": -33.6715087890625, + "rewards/KL_reward/std": 0.7248656153678894, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 597 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6780045351473923, + "grad_norm": 0.0002292812569066882, + "learning_rate": 5e-05, + "loss": 0.0042, + "num_tokens": 58102009.0, + "rewards/KL_reward/mean": -33.2886962890625, + "rewards/KL_reward/std": 1.0974416732788086, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 598 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6791383219954649, + "grad_norm": 0.0003618478949647397, + "learning_rate": 5e-05, + "loss": 0.0041, + "num_tokens": 58134009.0, + "rewards/KL_reward/mean": -34.09300231933594, + "rewards/KL_reward/std": 0.9552999138832092, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 599 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6802721088435374, + "grad_norm": 0.00029338515014387667, + "learning_rate": 5e-05, + "loss": 0.0029, + "num_tokens": 58165673.0, + "rewards/KL_reward/mean": -33.718257904052734, + "rewards/KL_reward/std": 1.2162995338439941, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 600 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.68140589569161, + "grad_norm": 0.002398415934294462, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_tokens": 58196625.0, + "rewards/KL_reward/mean": -33.8197135925293, + "rewards/KL_reward/std": 1.4261441230773926, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 601 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6825396825396826, + "grad_norm": 0.0002242231712443754, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_tokens": 58228697.0, + "rewards/KL_reward/mean": -33.507415771484375, + "rewards/KL_reward/std": 1.07442045211792, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 602 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6836734693877551, + "grad_norm": 0.0006778687820769846, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_tokens": 58260633.0, + "rewards/KL_reward/mean": -33.163368225097656, + "rewards/KL_reward/std": 0.9972267150878906, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 603 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0390625, + "epoch": 0.6848072562358276, + "grad_norm": 0.5446889400482178, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 58292790.0, + "rewards/KL_reward/mean": -33.51383590698242, + "rewards/KL_reward/std": 2.0158700942993164, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0006765823345631361, + "rewards/thinking_verbosity_reward/std": 0.00765465572476387, + "step": 604 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6859410430839002, + "grad_norm": 0.00029776192968711257, + "learning_rate": 5e-05, + "loss": -0.0016, + "num_tokens": 58324862.0, + "rewards/KL_reward/mean": -33.280799865722656, + "rewards/KL_reward/std": 0.9887773990631104, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 605 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6870748299319728, + "grad_norm": 0.0018429755000397563, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 58356718.0, + "rewards/KL_reward/mean": -33.600765228271484, + "rewards/KL_reward/std": 1.753105640411377, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 606 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6882086167800453, + "grad_norm": 0.0004957416094839573, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 58388950.0, + "rewards/KL_reward/mean": -33.10886001586914, + "rewards/KL_reward/std": 0.9649978280067444, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 607 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6893424036281179, + "grad_norm": 0.0045425486750900745, + "learning_rate": 5e-05, + "loss": 0.0032, + "num_tokens": 58421038.0, + "rewards/KL_reward/mean": -32.95924377441406, + "rewards/KL_reward/std": 1.7486824989318848, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 608 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.6904761904761905, + "grad_norm": 1.0173194408416748, + "learning_rate": 5e-05, + "loss": -0.0048, + "num_tokens": 58453096.0, + "rewards/KL_reward/mean": -33.599456787109375, + "rewards/KL_reward/std": 1.6784018278121948, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 609 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.691609977324263, + "grad_norm": 0.0007395737338811159, + "learning_rate": 5e-05, + "loss": 0.0055, + "num_tokens": 58485088.0, + "rewards/KL_reward/mean": -33.9056282043457, + "rewards/KL_reward/std": 0.8960281014442444, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 610 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6927437641723356, + "grad_norm": 0.0005152480443939567, + "learning_rate": 5e-05, + "loss": -0.0016, + "num_tokens": 58517096.0, + "rewards/KL_reward/mean": -33.851016998291016, + "rewards/KL_reward/std": 1.5645488500595093, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 611 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6938775510204082, + "grad_norm": 0.00024765508715063334, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_tokens": 58549704.0, + "rewards/KL_reward/mean": -33.67916488647461, + "rewards/KL_reward/std": 1.1806282997131348, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 612 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6950113378684807, + "grad_norm": 0.06450813263654709, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 58581488.0, + "rewards/KL_reward/mean": -33.84139633178711, + "rewards/KL_reward/std": 2.0141618251800537, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 613 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6961451247165533, + "grad_norm": 0.9488609433174133, + "learning_rate": 5e-05, + "loss": -0.003, + "num_tokens": 58613456.0, + "rewards/KL_reward/mean": -32.28491973876953, + "rewards/KL_reward/std": 2.4036858081817627, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 614 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6972789115646258, + "grad_norm": 0.0016067641554400325, + "learning_rate": 5e-05, + "loss": -0.0029, + "num_tokens": 58645432.0, + "rewards/KL_reward/mean": -33.97580337524414, + "rewards/KL_reward/std": 1.25538170337677, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 615 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.6984126984126984, + "grad_norm": 0.0003554086433723569, + "learning_rate": 5e-05, + "loss": -0.0027, + "num_tokens": 58677704.0, + "rewards/KL_reward/mean": -33.46821975708008, + "rewards/KL_reward/std": 2.084940195083618, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 616 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.699546485260771, + "grad_norm": 0.0007773024262860417, + "learning_rate": 5e-05, + "loss": 0.0036, + "num_tokens": 58709696.0, + "rewards/KL_reward/mean": -33.11626052856445, + "rewards/KL_reward/std": 1.1806670427322388, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 617 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7006802721088435, + "grad_norm": 0.07335913926362991, + "learning_rate": 5e-05, + "loss": 0.0045, + "num_tokens": 58742096.0, + "rewards/KL_reward/mean": -33.329498291015625, + "rewards/KL_reward/std": 1.485001802444458, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 618 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7018140589569161, + "grad_norm": 0.00014804053353145719, + "learning_rate": 5e-05, + "loss": 0.0061, + "num_tokens": 58774032.0, + "rewards/KL_reward/mean": -33.54657745361328, + "rewards/KL_reward/std": 1.0319091081619263, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 619 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7029478458049887, + "grad_norm": 0.00015110177628230304, + "learning_rate": 5e-05, + "loss": -0.0005, + "num_tokens": 58806544.0, + "rewards/KL_reward/mean": -33.54651641845703, + "rewards/KL_reward/std": 0.8972129225730896, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 620 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7040816326530612, + "grad_norm": 0.00027474554372020066, + "learning_rate": 5e-05, + "loss": -0.0029, + "num_tokens": 58838840.0, + "rewards/KL_reward/mean": -33.46837615966797, + "rewards/KL_reward/std": 0.8508355617523193, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 621 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7052154195011338, + "grad_norm": 0.00023305356444325298, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_tokens": 58870776.0, + "rewards/KL_reward/mean": -33.83546447753906, + "rewards/KL_reward/std": 1.3228144645690918, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 622 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7063492063492064, + "grad_norm": 6.870734796393663e-05, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 58901968.0, + "rewards/KL_reward/mean": -33.718536376953125, + "rewards/KL_reward/std": 0.7522987127304077, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 623 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7074829931972789, + "grad_norm": 0.0712951123714447, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 58934136.0, + "rewards/KL_reward/mean": -33.2022819519043, + "rewards/KL_reward/std": 1.5445148944854736, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 624 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7086167800453514, + "grad_norm": 0.5388066172599792, + "learning_rate": 5e-05, + "loss": 0.0049, + "num_tokens": 58965536.0, + "rewards/KL_reward/mean": -32.57041931152344, + "rewards/KL_reward/std": 4.242175579071045, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 625 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7097505668934241, + "grad_norm": 0.0003903552424162626, + "learning_rate": 5e-05, + "loss": 0.011, + "num_tokens": 58998136.0, + "rewards/KL_reward/mean": -33.62450408935547, + "rewards/KL_reward/std": 1.1225509643554688, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 626 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7108843537414966, + "grad_norm": 0.5541359782218933, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_tokens": 59030352.0, + "rewards/KL_reward/mean": -32.1502799987793, + "rewards/KL_reward/std": 3.336327314376831, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 627 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7120181405895691, + "grad_norm": 0.001875443966127932, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 59062008.0, + "rewards/KL_reward/mean": -33.85092544555664, + "rewards/KL_reward/std": 1.4876642227172852, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 628 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7131519274376418, + "grad_norm": 0.00022843039187137038, + "learning_rate": 5e-05, + "loss": -0.001, + "num_tokens": 59094136.0, + "rewards/KL_reward/mean": -33.3590087890625, + "rewards/KL_reward/std": 1.20937979221344, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 629 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7142857142857143, + "grad_norm": 0.00017198668501805514, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 59126424.0, + "rewards/KL_reward/mean": -33.39804458618164, + "rewards/KL_reward/std": 0.9631538987159729, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 630 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7154195011337868, + "grad_norm": 0.058965396136045456, + "learning_rate": 5e-05, + "loss": -0.0024, + "num_tokens": 59158968.0, + "rewards/KL_reward/mean": -33.300758361816406, + "rewards/KL_reward/std": 1.4007177352905273, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 631 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7165532879818595, + "grad_norm": 0.0004953857278451324, + "learning_rate": 5e-05, + "loss": 0.0075, + "num_tokens": 59190792.0, + "rewards/KL_reward/mean": -33.7652473449707, + "rewards/KL_reward/std": 1.009839653968811, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 632 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.717687074829932, + "grad_norm": 0.0001303389435634017, + "learning_rate": 5e-05, + "loss": 0.0039, + "num_tokens": 59222240.0, + "rewards/KL_reward/mean": -33.73405838012695, + "rewards/KL_reward/std": 1.7354185581207275, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 633 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7188208616780045, + "grad_norm": 0.3652340769767761, + "learning_rate": 5e-05, + "loss": 0.0045, + "num_tokens": 59254552.0, + "rewards/KL_reward/mean": -32.576416015625, + "rewards/KL_reward/std": 2.7574236392974854, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 634 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.719954648526077, + "grad_norm": 0.0008426376734860241, + "learning_rate": 5e-05, + "loss": 0.004, + "num_tokens": 59287064.0, + "rewards/KL_reward/mean": -33.13212203979492, + "rewards/KL_reward/std": 1.009618878364563, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 635 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7210884353741497, + "grad_norm": 0.00012571057595778257, + "learning_rate": 5e-05, + "loss": -0.0112, + "num_tokens": 59318632.0, + "rewards/KL_reward/mean": -33.55448913574219, + "rewards/KL_reward/std": 0.9037834405899048, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 636 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7222222222222222, + "grad_norm": 0.0003126139345113188, + "learning_rate": 5e-05, + "loss": -0.0126, + "num_tokens": 59350408.0, + "rewards/KL_reward/mean": -33.88240051269531, + "rewards/KL_reward/std": 1.4632179737091064, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 637 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7233560090702947, + "grad_norm": 0.0002778090420179069, + "learning_rate": 5e-05, + "loss": -0.0102, + "num_tokens": 59382320.0, + "rewards/KL_reward/mean": -33.33558654785156, + "rewards/KL_reward/std": 1.2727230787277222, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 638 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7244897959183674, + "grad_norm": 0.00049192103324458, + "learning_rate": 5e-05, + "loss": 0.0101, + "num_tokens": 59414512.0, + "rewards/KL_reward/mean": -33.272857666015625, + "rewards/KL_reward/std": 1.0962445735931396, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 639 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7256235827664399, + "grad_norm": 0.00018673698650673032, + "learning_rate": 5e-05, + "loss": 0.0058, + "num_tokens": 59445912.0, + "rewards/KL_reward/mean": -33.45282745361328, + "rewards/KL_reward/std": 1.211887001991272, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 640 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7267573696145124, + "grad_norm": 0.995429277420044, + "learning_rate": 5e-05, + "loss": 0.0034, + "num_tokens": 59477952.0, + "rewards/KL_reward/mean": -31.681346893310547, + "rewards/KL_reward/std": 4.268650531768799, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 641 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7278911564625851, + "grad_norm": 0.19380541145801544, + "learning_rate": 5e-05, + "loss": -0.0016, + "num_tokens": 59509904.0, + "rewards/KL_reward/mean": -33.75501251220703, + "rewards/KL_reward/std": 1.0362346172332764, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 642 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7290249433106576, + "grad_norm": 0.6822595000267029, + "learning_rate": 5e-05, + "loss": 0.0021, + "num_tokens": 59541992.0, + "rewards/KL_reward/mean": -31.853694915771484, + "rewards/KL_reward/std": 4.532584190368652, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 643 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7301587301587301, + "grad_norm": 9.342659905087203e-05, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_tokens": 59574264.0, + "rewards/KL_reward/mean": -33.14033508300781, + "rewards/KL_reward/std": 1.1645630598068237, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 644 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7312925170068028, + "grad_norm": 0.00038804521318525076, + "learning_rate": 5e-05, + "loss": 0.0033, + "num_tokens": 59606184.0, + "rewards/KL_reward/mean": -33.734004974365234, + "rewards/KL_reward/std": 1.066764235496521, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 645 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7324263038548753, + "grad_norm": 0.22279416024684906, + "learning_rate": 5e-05, + "loss": -0.0084, + "num_tokens": 59637624.0, + "rewards/KL_reward/mean": -32.14263916015625, + "rewards/KL_reward/std": 6.758326053619385, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 646 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7335600907029478, + "grad_norm": 0.000317032216116786, + "learning_rate": 5e-05, + "loss": -0.0005, + "num_tokens": 59670168.0, + "rewards/KL_reward/mean": -33.10893249511719, + "rewards/KL_reward/std": 0.9557627439498901, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 647 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7346938775510204, + "grad_norm": 0.0006659679929725826, + "learning_rate": 5e-05, + "loss": -0.0031, + "num_tokens": 59701792.0, + "rewards/KL_reward/mean": -34.12440490722656, + "rewards/KL_reward/std": 1.477248191833496, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 648 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.735827664399093, + "grad_norm": 0.0002181259769713506, + "learning_rate": 5e-05, + "loss": 0.0064, + "num_tokens": 59734440.0, + "rewards/KL_reward/mean": -33.26518630981445, + "rewards/KL_reward/std": 0.6842576265335083, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 649 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7369614512471655, + "grad_norm": 0.00024293440219480544, + "learning_rate": 5e-05, + "loss": -0.0095, + "num_tokens": 59766600.0, + "rewards/KL_reward/mean": -33.562110900878906, + "rewards/KL_reward/std": 1.3676056861877441, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 650 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7380952380952381, + "grad_norm": 0.0003284573322162032, + "learning_rate": 5e-05, + "loss": -0.0108, + "num_tokens": 59798256.0, + "rewards/KL_reward/mean": -33.58565902709961, + "rewards/KL_reward/std": 1.092898964881897, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 651 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7392290249433107, + "grad_norm": 0.000712559325620532, + "learning_rate": 5e-05, + "loss": -0.0031, + "num_tokens": 59829984.0, + "rewards/KL_reward/mean": -33.616432189941406, + "rewards/KL_reward/std": 1.291390299797058, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 652 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7403628117913832, + "grad_norm": 0.0013158658985048532, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 59862560.0, + "rewards/KL_reward/mean": -33.030364990234375, + "rewards/KL_reward/std": 0.949847400188446, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 653 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.7414965986394558, + "grad_norm": 0.17137093842029572, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 59893937.0, + "rewards/KL_reward/mean": -33.508331298828125, + "rewards/KL_reward/std": 1.0768557786941528, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 654 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7426303854875284, + "grad_norm": 0.11959031224250793, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 59925505.0, + "rewards/KL_reward/mean": -33.461448669433594, + "rewards/KL_reward/std": 1.6048932075500488, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 655 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7437641723356009, + "grad_norm": 0.0006741550751030445, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_tokens": 59957553.0, + "rewards/KL_reward/mean": -33.40549850463867, + "rewards/KL_reward/std": 1.3017975091934204, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 656 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7448979591836735, + "grad_norm": 0.05803260952234268, + "learning_rate": 5e-05, + "loss": 0.0039, + "num_tokens": 59989185.0, + "rewards/KL_reward/mean": -33.339691162109375, + "rewards/KL_reward/std": 1.560107707977295, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 657 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.746031746031746, + "grad_norm": 0.00023864081595093012, + "learning_rate": 5e-05, + "loss": 0.0048, + "num_tokens": 60021105.0, + "rewards/KL_reward/mean": -33.468257904052734, + "rewards/KL_reward/std": 1.1290925741195679, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 658 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7471655328798186, + "grad_norm": 0.2241455465555191, + "learning_rate": 5e-05, + "loss": -0.0011, + "num_tokens": 60052873.0, + "rewards/KL_reward/mean": -33.317718505859375, + "rewards/KL_reward/std": 1.3931663036346436, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 659 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7482993197278912, + "grad_norm": 0.0003466380585450679, + "learning_rate": 5e-05, + "loss": -0.0013, + "num_tokens": 60084865.0, + "rewards/KL_reward/mean": -33.85884475708008, + "rewards/KL_reward/std": 0.9919801950454712, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 660 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7494331065759637, + "grad_norm": 0.008189519867300987, + "learning_rate": 5e-05, + "loss": -0.003, + "num_tokens": 60116865.0, + "rewards/KL_reward/mean": -33.21611785888672, + "rewards/KL_reward/std": 1.6068731546401978, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 661 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7505668934240363, + "grad_norm": 0.0010099131613969803, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 60148969.0, + "rewards/KL_reward/mean": -33.85078430175781, + "rewards/KL_reward/std": 1.8423516750335693, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 662 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7517006802721088, + "grad_norm": 0.0002411614841548726, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 60181785.0, + "rewards/KL_reward/mean": -34.116703033447266, + "rewards/KL_reward/std": 1.4885939359664917, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 663 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7528344671201814, + "grad_norm": 0.0003497452998999506, + "learning_rate": 5e-05, + "loss": 0.0062, + "num_tokens": 60213737.0, + "rewards/KL_reward/mean": -33.90576171875, + "rewards/KL_reward/std": 1.1969181299209595, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 664 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.753968253968254, + "grad_norm": 0.0003262519312556833, + "learning_rate": 5e-05, + "loss": 0.001, + "num_tokens": 60245929.0, + "rewards/KL_reward/mean": -33.390018463134766, + "rewards/KL_reward/std": 0.8370320200920105, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 665 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7551020408163265, + "grad_norm": 0.0005763856461271644, + "learning_rate": 5e-05, + "loss": -0.0021, + "num_tokens": 60277121.0, + "rewards/KL_reward/mean": -34.03056716918945, + "rewards/KL_reward/std": 1.0610737800598145, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 666 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7562358276643991, + "grad_norm": 0.0036802797112613916, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 60309265.0, + "rewards/KL_reward/mean": -33.4349250793457, + "rewards/KL_reward/std": 1.7730191946029663, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 667 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7573696145124716, + "grad_norm": 0.0006001257570460439, + "learning_rate": 5e-05, + "loss": -0.0031, + "num_tokens": 60340953.0, + "rewards/KL_reward/mean": -33.36646270751953, + "rewards/KL_reward/std": 1.1170799732208252, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 668 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.7585034013605442, + "grad_norm": 0.25414907932281494, + "learning_rate": 5e-05, + "loss": -0.0011, + "num_tokens": 60373370.0, + "rewards/KL_reward/mean": -33.295066833496094, + "rewards/KL_reward/std": 2.0293033123016357, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 669 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7596371882086168, + "grad_norm": 0.0005584516911767423, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 60405106.0, + "rewards/KL_reward/mean": -33.63987731933594, + "rewards/KL_reward/std": 0.9843823909759521, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 670 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7607709750566893, + "grad_norm": 0.0005905695143155754, + "learning_rate": 5e-05, + "loss": 0.0028, + "num_tokens": 60437050.0, + "rewards/KL_reward/mean": -33.20232391357422, + "rewards/KL_reward/std": 1.0796029567718506, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 671 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7619047619047619, + "grad_norm": 0.0010460438206791878, + "learning_rate": 5e-05, + "loss": -0.0022, + "num_tokens": 60469426.0, + "rewards/KL_reward/mean": -33.624046325683594, + "rewards/KL_reward/std": 1.0314666032791138, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 672 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7630385487528345, + "grad_norm": 0.001069650985300541, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 60501690.0, + "rewards/KL_reward/mean": -33.50665283203125, + "rewards/KL_reward/std": 1.207753300666809, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 673 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.764172335600907, + "grad_norm": 0.00037095643347129226, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 60534314.0, + "rewards/KL_reward/mean": -32.88195037841797, + "rewards/KL_reward/std": 0.6923331022262573, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 674 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7653061224489796, + "grad_norm": 0.0015014632372185588, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_tokens": 60566250.0, + "rewards/KL_reward/mean": -33.47536849975586, + "rewards/KL_reward/std": 1.4117809534072876, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 675 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7664399092970522, + "grad_norm": 0.0032663193996995687, + "learning_rate": 5e-05, + "loss": -0.002, + "num_tokens": 60598346.0, + "rewards/KL_reward/mean": -33.787513732910156, + "rewards/KL_reward/std": 1.0018196105957031, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 676 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7675736961451247, + "grad_norm": 0.0004160176613368094, + "learning_rate": 5e-05, + "loss": 0.0022, + "num_tokens": 60630674.0, + "rewards/KL_reward/mean": -33.78032684326172, + "rewards/KL_reward/std": 0.9000056385993958, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 677 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7687074829931972, + "grad_norm": 0.0012931914534419775, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_tokens": 60662610.0, + "rewards/KL_reward/mean": -34.00667190551758, + "rewards/KL_reward/std": 1.7070261240005493, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 678 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7698412698412699, + "grad_norm": 0.2659655213356018, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 60694410.0, + "rewards/KL_reward/mean": -33.678863525390625, + "rewards/KL_reward/std": 1.627038836479187, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 679 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7709750566893424, + "grad_norm": 0.060414332896471024, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 60726946.0, + "rewards/KL_reward/mean": -33.41743469238281, + "rewards/KL_reward/std": 1.603200912475586, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 680 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7721088435374149, + "grad_norm": 0.0018846142338588834, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 60758514.0, + "rewards/KL_reward/mean": -33.86500549316406, + "rewards/KL_reward/std": 1.076326608657837, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 681 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7732426303854876, + "grad_norm": 0.001117186271585524, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 60790578.0, + "rewards/KL_reward/mean": -33.70912170410156, + "rewards/KL_reward/std": 2.305903434753418, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 682 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7743764172335601, + "grad_norm": 0.0017752180574461818, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 60821994.0, + "rewards/KL_reward/mean": -33.787025451660156, + "rewards/KL_reward/std": 1.4653655290603638, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 683 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7755102040816326, + "grad_norm": 0.05909721180796623, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 60854154.0, + "rewards/KL_reward/mean": -33.027061462402344, + "rewards/KL_reward/std": 1.6789178848266602, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 684 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7766439909297053, + "grad_norm": 0.0009766574949026108, + "learning_rate": 5e-05, + "loss": -0.0011, + "num_tokens": 60886138.0, + "rewards/KL_reward/mean": -33.709434509277344, + "rewards/KL_reward/std": 1.4155031442642212, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 685 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.7777777777777778, + "grad_norm": 0.3626880645751953, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_tokens": 60918075.0, + "rewards/KL_reward/mean": -32.75965881347656, + "rewards/KL_reward/std": 2.005737543106079, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 686 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7789115646258503, + "grad_norm": 0.0031591549050062895, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 60949803.0, + "rewards/KL_reward/mean": -33.69989013671875, + "rewards/KL_reward/std": 1.6075519323349, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 687 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.780045351473923, + "grad_norm": 0.24598188698291779, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_tokens": 60981692.0, + "rewards/KL_reward/mean": -32.968814849853516, + "rewards/KL_reward/std": 2.120867967605591, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 688 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7811791383219955, + "grad_norm": 0.003578277537599206, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 61013236.0, + "rewards/KL_reward/mean": -33.44221496582031, + "rewards/KL_reward/std": 1.487945318222046, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 689 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.782312925170068, + "grad_norm": 0.02068578079342842, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 61045420.0, + "rewards/KL_reward/mean": -33.07246398925781, + "rewards/KL_reward/std": 1.2704086303710938, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 690 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.7834467120181405, + "grad_norm": 0.2770971953868866, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 61077428.0, + "rewards/KL_reward/mean": -33.59265899658203, + "rewards/KL_reward/std": 1.986149787902832, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 691 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.7845804988662132, + "grad_norm": 0.3363737463951111, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 61109413.0, + "rewards/KL_reward/mean": -34.014347076416016, + "rewards/KL_reward/std": 1.1279405355453491, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 692 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.7857142857142857, + "grad_norm": 0.39339056611061096, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61141135.0, + "rewards/KL_reward/mean": -33.55498504638672, + "rewards/KL_reward/std": 1.4367928504943848, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 693 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.7868480725623582, + "grad_norm": 0.2678931653499603, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61173496.0, + "rewards/KL_reward/mean": -32.55483627319336, + "rewards/KL_reward/std": 1.5880851745605469, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 694 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.046875, + "epoch": 0.7879818594104309, + "grad_norm": 1.5173062086105347, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61205470.0, + "rewards/KL_reward/mean": -32.813358306884766, + "rewards/KL_reward/std": 3.2993099689483643, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 695 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.140625, + "epoch": 0.7891156462585034, + "grad_norm": 2.8795690536499023, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61237992.0, + "rewards/KL_reward/mean": -33.01683044433594, + "rewards/KL_reward/std": 2.0140583515167236, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 696 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.390625, + "epoch": 0.7902494331065759, + "grad_norm": 4.886504173278809, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61269946.0, + "rewards/KL_reward/mean": -31.504554748535156, + "rewards/KL_reward/std": 2.4660799503326416, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 697 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.65625, + "epoch": 0.7913832199546486, + "grad_norm": 4.877257347106934, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61301942.0, + "rewards/KL_reward/mean": -30.575576782226562, + "rewards/KL_reward/std": 1.9110974073410034, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 698 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.8359375, + "epoch": 0.7925170068027211, + "grad_norm": 2.782092809677124, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61334393.0, + "rewards/KL_reward/mean": -30.690383911132812, + "rewards/KL_reward/std": 1.3091487884521484, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 699 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.96875, + "epoch": 0.7936507936507936, + "grad_norm": 1.6190053224563599, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61366973.0, + "rewards/KL_reward/mean": -29.413843154907227, + "rewards/KL_reward/std": 2.7619032859802246, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.000943052233196795, + "rewards/thinking_verbosity_reward/std": 0.007626189850270748, + "step": 700 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.9921875, + "epoch": 0.7947845804988662, + "grad_norm": 0.9022070169448853, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61399556.0, + "rewards/KL_reward/mean": -29.979713439941406, + "rewards/KL_reward/std": 1.8409754037857056, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 701 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 2.1640625, + "epoch": 0.7959183673469388, + "grad_norm": 1.317862868309021, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61431833.0, + "rewards/KL_reward/mean": -28.689163208007812, + "rewards/KL_reward/std": 3.999366283416748, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.09943688660860062, + "rewards/angle_reward/std": 0.29072776436805725, + "rewards/thinking_verbosity_reward/mean": -0.002045338973402977, + "rewards/thinking_verbosity_reward/std": 0.013879266567528248, + "step": 702 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 2.1328125, + "epoch": 0.7970521541950113, + "grad_norm": 1.5643107891082764, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61464122.0, + "rewards/KL_reward/mean": -28.878559112548828, + "rewards/KL_reward/std": 2.922321319580078, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.000943052233196795, + "rewards/thinking_verbosity_reward/std": 0.007626189850270748, + "step": 703 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 2.484375, + "epoch": 0.7981859410430839, + "grad_norm": 2.655838966369629, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61496728.0, + "rewards/KL_reward/mean": -26.264991760253906, + "rewards/KL_reward/std": 4.358268737792969, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.18782523274421692, + "rewards/angle_reward/std": 0.3916890025138855, + "rewards/thinking_verbosity_reward/mean": -0.0027343749534338713, + "rewards/thinking_verbosity_reward/std": 0.011413133703172207, + "step": 704 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 2.8671875, + "epoch": 0.7993197278911565, + "grad_norm": 2.6706795692443848, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61529255.0, + "rewards/KL_reward/mean": -23.64773178100586, + "rewards/KL_reward/std": 4.189241409301758, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 705 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 3.6796875, + "epoch": 0.800453514739229, + "grad_norm": 2.53173565864563, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61561038.0, + "rewards/KL_reward/mean": -21.3406982421875, + "rewards/KL_reward/std": 5.221982955932617, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.022097086533904076, + "rewards/angle_reward/std": 0.17607934772968292, + "rewards/thinking_verbosity_reward/mean": -0.0011048543965443969, + "rewards/thinking_verbosity_reward/std": 0.00880396831780672, + "step": 706 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 5.890625, + "epoch": 0.8015873015873016, + "grad_norm": 4.612654209136963, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61593640.0, + "rewards/KL_reward/mean": -16.051362991333008, + "rewards/KL_reward/std": 6.481986045837402, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.18782523274421692, + "rewards/angle_reward/std": 0.3916890025138855, + "rewards/thinking_verbosity_reward/mean": -0.0035057389177381992, + "rewards/thinking_verbosity_reward/std": 0.014961066655814648, + "step": 707 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 8.75, + "epoch": 0.8027210884353742, + "grad_norm": 1.8879761695861816, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61626128.0, + "rewards/KL_reward/mean": -12.419229507446289, + "rewards/KL_reward/std": 6.6007513999938965, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.09943688660860062, + "rewards/angle_reward/std": 0.3406151831150055, + "rewards/thinking_verbosity_reward/mean": -0.002896177349612117, + "rewards/thinking_verbosity_reward/std": 0.01220763847231865, + "step": 708 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 11.8828125, + "epoch": 0.8038548752834467, + "grad_norm": 1.5015785694122314, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61658801.0, + "rewards/KL_reward/mean": -9.83292007446289, + "rewards/KL_reward/std": 6.10657262802124, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.22097086906433105, + "rewards/angle_reward/std": 0.48399582505226135, + "rewards/thinking_verbosity_reward/mean": -0.007983904331922531, + "rewards/thinking_verbosity_reward/std": 0.022066637873649597, + "step": 709 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 15.640625, + "epoch": 0.8049886621315193, + "grad_norm": 2.836428165435791, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61692459.0, + "rewards/KL_reward/mean": -7.300546646118164, + "rewards/KL_reward/std": 4.535953521728516, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.37565046548843384, + "rewards/angle_reward/std": 0.5169374942779541, + "rewards/thinking_verbosity_reward/mean": -0.010226922109723091, + "rewards/thinking_verbosity_reward/std": 0.02758130244910717, + "step": 710 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 20.21875, + "epoch": 0.8061224489795918, + "grad_norm": 2.2767698764801025, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61726919.0, + "rewards/KL_reward/mean": -5.976463794708252, + "rewards/KL_reward/std": 4.255782127380371, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.408796101808548, + "rewards/angle_reward/std": 0.522029459476471, + "rewards/thinking_verbosity_reward/mean": -0.010747408494353294, + "rewards/thinking_verbosity_reward/std": 0.027381112799048424, + "step": 711 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 45.9921875, + "epoch": 0.8072562358276644, + "grad_norm": 2.9280054569244385, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61764870.0, + "rewards/KL_reward/mean": -6.136072158813477, + "rewards/KL_reward/std": 4.335809707641602, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3314563035964966, + "rewards/angle_reward/std": 0.5746446847915649, + "rewards/thinking_verbosity_reward/mean": -0.046691492199897766, + "rewards/thinking_verbosity_reward/std": 0.24875198304653168, + "step": 712 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 90.9140625, + "epoch": 0.808390022675737, + "grad_norm": 2.419306516647339, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61808875.0, + "rewards/KL_reward/mean": -5.737112045288086, + "rewards/KL_reward/std": 5.006260871887207, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.30935922265052795, + "rewards/angle_reward/std": 0.6131755709648132, + "rewards/thinking_verbosity_reward/mean": -0.10689996182918549, + "rewards/thinking_verbosity_reward/std": 0.35072776675224304, + "step": 713 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 85.3515625, + "epoch": 0.8095238095238095, + "grad_norm": 4.568090915679932, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 61852472.0, + "rewards/KL_reward/mean": -5.3782958984375, + "rewards/KL_reward/std": 4.274779796600342, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3314563035964966, + "rewards/angle_reward/std": 0.5465532541275024, + "rewards/thinking_verbosity_reward/mean": -0.10199468582868576, + "rewards/thinking_verbosity_reward/std": 0.35763248801231384, + "step": 714 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 146.1015625, + "epoch": 0.8106575963718821, + "grad_norm": 2.923835515975952, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61903389.0, + "rewards/KL_reward/mean": -5.380932807922363, + "rewards/KL_reward/std": 4.547464370727539, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.19887377321720123, + "rewards/angle_reward/std": 0.681230366230011, + "rewards/thinking_verbosity_reward/mean": -0.2453625500202179, + "rewards/thinking_verbosity_reward/std": 0.4244639575481415, + "step": 715 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 195.171875, + "epoch": 0.8117913832199547, + "grad_norm": 0.8407360315322876, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 61960347.0, + "rewards/KL_reward/mean": -3.7000811100006104, + "rewards/KL_reward/std": 3.552907943725586, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.20992231369018555, + "rewards/angle_reward/std": 0.6041808128356934, + "rewards/thinking_verbosity_reward/mean": -0.23007535934448242, + "rewards/thinking_verbosity_reward/std": 0.4760555028915405, + "step": 716 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 352.546875, + "epoch": 0.8129251700680272, + "grad_norm": 2.3982837200164795, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 62037737.0, + "rewards/KL_reward/mean": -3.6907198429107666, + "rewards/KL_reward/std": 3.331489324569702, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.15467959642410278, + "rewards/angle_reward/std": 0.6695720553398132, + "rewards/thinking_verbosity_reward/mean": -0.4037805199623108, + "rewards/thinking_verbosity_reward/std": 0.6441807746887207, + "step": 717 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 186.6953125, + "epoch": 0.8140589569160998, + "grad_norm": 1.133590579032898, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62093306.0, + "rewards/KL_reward/mean": -3.6116247177124023, + "rewards/KL_reward/std": 2.6408379077911377, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.19887377321720123, + "rewards/angle_reward/std": 0.6577072143554688, + "rewards/thinking_verbosity_reward/mean": -0.21827402710914612, + "rewards/thinking_verbosity_reward/std": 0.47493577003479004, + "step": 718 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 302.1328125, + "epoch": 0.8151927437641724, + "grad_norm": 2.1491713523864746, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 62164507.0, + "rewards/KL_reward/mean": -3.0284664630889893, + "rewards/KL_reward/std": 2.8550503253936768, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.4087960720062256, + "rewards/angle_reward/std": 0.5792295336723328, + "rewards/thinking_verbosity_reward/mean": -0.2621035575866699, + "rewards/thinking_verbosity_reward/std": 0.6942629218101501, + "step": 719 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 233.4296875, + "epoch": 0.8163265306122449, + "grad_norm": 0.5096162557601929, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62226674.0, + "rewards/KL_reward/mean": -2.5021750926971436, + "rewards/KL_reward/std": 1.2480705976486206, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3425048589706421, + "rewards/angle_reward/std": 0.5680770874023438, + "rewards/thinking_verbosity_reward/mean": -0.17499490082263947, + "rewards/thinking_verbosity_reward/std": 0.5807626247406006, + "step": 720 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 170.0390625, + "epoch": 0.8174603174603174, + "grad_norm": 1.7022823095321655, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 62279327.0, + "rewards/KL_reward/mean": -2.7811903953552246, + "rewards/KL_reward/std": 2.0388731956481934, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.39774757623672485, + "rewards/angle_reward/std": 0.5305619835853577, + "rewards/thinking_verbosity_reward/mean": -0.13159838318824768, + "rewards/thinking_verbosity_reward/std": 0.5158379673957825, + "step": 721 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 165.59375, + "epoch": 0.81859410430839, + "grad_norm": 1.1229782104492188, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62333123.0, + "rewards/KL_reward/mean": -2.5608346462249756, + "rewards/KL_reward/std": 1.7398568391799927, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.38669902086257935, + "rewards/angle_reward/std": 0.5672101378440857, + "rewards/thinking_verbosity_reward/mean": -0.13085666298866272, + "rewards/thinking_verbosity_reward/std": 0.4741191267967224, + "step": 722 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 394.4765625, + "epoch": 0.8197278911564626, + "grad_norm": 1.1178206205368042, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 62415904.0, + "rewards/KL_reward/mean": -2.433047294616699, + "rewards/KL_reward/std": 2.6464271545410156, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3314563035964966, + "rewards/angle_reward/std": 0.5746446251869202, + "rewards/thinking_verbosity_reward/mean": -0.3151102662086487, + "rewards/thinking_verbosity_reward/std": 0.7379494309425354, + "step": 723 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 414.7890625, + "epoch": 0.8208616780045351, + "grad_norm": 3.102323293685913, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 62500749.0, + "rewards/KL_reward/mean": -2.4026496410369873, + "rewards/KL_reward/std": 2.3765244483947754, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3425048291683197, + "rewards/angle_reward/std": 0.5951534509658813, + "rewards/thinking_verbosity_reward/mean": -0.33660024404525757, + "rewards/thinking_verbosity_reward/std": 0.7675768733024597, + "step": 724 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 844.5078125, + "epoch": 0.8219954648526077, + "grad_norm": 0.27251550555229187, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 62640990.0, + "rewards/KL_reward/mean": -1.4234731197357178, + "rewards/KL_reward/std": 1.720995545387268, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.1657281517982483, + "rewards/angle_reward/std": 0.642855167388916, + "rewards/thinking_verbosity_reward/mean": -0.7711927890777588, + "rewards/thinking_verbosity_reward/std": 0.8828684091567993, + "step": 725 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1464.8359375, + "epoch": 0.8231292517006803, + "grad_norm": 0.2615896761417389, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 62860689.0, + "rewards/KL_reward/mean": -0.6533093452453613, + "rewards/KL_reward/std": 1.104088544845581, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.011048540472984314, + "rewards/angle_reward/std": 0.7097985744476318, + "rewards/thinking_verbosity_reward/mean": -1.3578754663467407, + "rewards/thinking_verbosity_reward/std": 0.7664743661880493, + "step": 726 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1577.6015625, + "epoch": 0.8242630385487528, + "grad_norm": 0.5463308095932007, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 63094590.0, + "rewards/KL_reward/mean": -0.7473901510238647, + "rewards/KL_reward/std": 1.843743920326233, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314563259482384, + "rewards/angle_reward/std": 0.7091048955917358, + "rewards/thinking_verbosity_reward/mean": -1.3820425271987915, + "rewards/thinking_verbosity_reward/std": 0.831028163433075, + "step": 727 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1776.9375, + "epoch": 0.8253968253968254, + "grad_norm": 0.05059307813644409, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 63354686.0, + "rewards/KL_reward/mean": -0.37151604890823364, + "rewards/KL_reward/std": 0.4577194154262543, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.04419416934251785, + "rewards/angle_reward/std": 0.7084973454475403, + "rewards/thinking_verbosity_reward/mean": -1.624656081199646, + "rewards/thinking_verbosity_reward/std": 0.5813239216804504, + "step": 728 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1764.0078125, + "epoch": 0.826530612244898, + "grad_norm": 0.01426609791815281, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 63612815.0, + "rewards/KL_reward/mean": -0.34279125928878784, + "rewards/KL_reward/std": 0.4354683458805084, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.022097088396549225, + "rewards/angle_reward/std": 0.709538459777832, + "rewards/thinking_verbosity_reward/mean": -1.6274614334106445, + "rewards/thinking_verbosity_reward/std": 0.5408163666725159, + "step": 729 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1806.03125, + "epoch": 0.8276643990929705, + "grad_norm": 0.011248442344367504, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 63876291.0, + "rewards/KL_reward/mean": -0.29520583152770996, + "rewards/KL_reward/std": 0.3265356421470642, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.7091048955917358, + "rewards/thinking_verbosity_reward/mean": -1.656254768371582, + "rewards/thinking_verbosity_reward/std": 0.5175119638442993, + "step": 730 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1666.96875, + "epoch": 0.828798185941043, + "grad_norm": 0.011705402284860611, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 64121343.0, + "rewards/KL_reward/mean": -0.3267815411090851, + "rewards/KL_reward/std": 0.5070149302482605, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.022097084671258926, + "rewards/angle_reward/std": 0.709538459777832, + "rewards/thinking_verbosity_reward/mean": -1.5810213088989258, + "rewards/thinking_verbosity_reward/std": 0.5288156867027283, + "step": 731 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1932.1640625, + "epoch": 0.8299319727891157, + "grad_norm": 0.029215455055236816, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 64401052.0, + "rewards/KL_reward/mean": -0.35986757278442383, + "rewards/KL_reward/std": 0.8738176226615906, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.12153397500514984, + "rewards/angle_reward/std": 0.6993212699890137, + "rewards/thinking_verbosity_reward/mean": -1.7087500095367432, + "rewards/thinking_verbosity_reward/std": 0.5540984272956848, + "step": 732 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1894.4765625, + "epoch": 0.8310657596371882, + "grad_norm": 0.014189718291163445, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 64675121.0, + "rewards/KL_reward/mean": -0.33786386251449585, + "rewards/KL_reward/std": 0.568859875202179, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.13258251547813416, + "rewards/angle_reward/std": 0.6972951292991638, + "rewards/thinking_verbosity_reward/mean": -1.6881887912750244, + "rewards/thinking_verbosity_reward/std": 0.5704836845397949, + "step": 733 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1913.125, + "epoch": 0.8321995464852607, + "grad_norm": 0.3056281805038452, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 64952009.0, + "rewards/KL_reward/mean": -0.5387207269668579, + "rewards/KL_reward/std": 1.6692149639129639, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.13258251547813416, + "rewards/angle_reward/std": 0.6972951292991638, + "rewards/thinking_verbosity_reward/mean": -1.685068130493164, + "rewards/thinking_verbosity_reward/std": 0.5932014584541321, + "step": 734 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1809.8359375, + "epoch": 0.8333333333333334, + "grad_norm": 0.011273681186139584, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 65215924.0, + "rewards/KL_reward/mean": -0.28363704681396484, + "rewards/KL_reward/std": 0.3439120054244995, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.7091048955917358, + "rewards/thinking_verbosity_reward/mean": -1.6670418977737427, + "rewards/thinking_verbosity_reward/std": 0.5123423337936401, + "step": 735 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1822.828125, + "epoch": 0.8344671201814059, + "grad_norm": 0.011458742432296276, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 65480478.0, + "rewards/KL_reward/mean": -0.3005499839782715, + "rewards/KL_reward/std": 0.38731321692466736, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.7091048955917358, + "rewards/thinking_verbosity_reward/mean": -1.6545227766036987, + "rewards/thinking_verbosity_reward/std": 0.5496739745140076, + "step": 736 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1889.734375, + "epoch": 0.8356009070294784, + "grad_norm": 0.009957044385373592, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 65754076.0, + "rewards/KL_reward/mean": -0.28186577558517456, + "rewards/KL_reward/std": 0.3413471579551697, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.07733979821205139, + "rewards/angle_reward/std": 0.7056263089179993, + "rewards/thinking_verbosity_reward/mean": -1.696712613105774, + "rewards/thinking_verbosity_reward/std": 0.5212931632995605, + "step": 737 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1938.4375, + "epoch": 0.8367346938775511, + "grad_norm": 0.00860871933400631, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 66033868.0, + "rewards/KL_reward/mean": -0.25816720724105835, + "rewards/KL_reward/std": 0.30099010467529297, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.04419417306780815, + "rewards/angle_reward/std": 0.7084973454475403, + "rewards/thinking_verbosity_reward/mean": -1.7327439785003662, + "rewards/thinking_verbosity_reward/std": 0.5168812870979309, + "step": 738 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1825.859375, + "epoch": 0.8378684807256236, + "grad_norm": 0.013760825619101524, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 66298474.0, + "rewards/KL_reward/mean": -0.3341813087463379, + "rewards/KL_reward/std": 0.7779108285903931, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.07733979821205139, + "rewards/angle_reward/std": 0.7056263089179993, + "rewards/thinking_verbosity_reward/mean": -1.6584984064102173, + "rewards/thinking_verbosity_reward/std": 0.54533451795578, + "step": 739 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 2007.4453125, + "epoch": 0.8390022675736961, + "grad_norm": 0.23512808978557587, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 66587291.0, + "rewards/KL_reward/mean": -0.5901498794555664, + "rewards/KL_reward/std": 2.1927568912506104, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.19887377321720123, + "rewards/angle_reward/std": 0.681230366230011, + "rewards/thinking_verbosity_reward/mean": -1.7421135902404785, + "rewards/thinking_verbosity_reward/std": 0.5695531964302063, + "step": 740 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1833.0703125, + "epoch": 0.8401360544217688, + "grad_norm": 0.19070129096508026, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 66853860.0, + "rewards/KL_reward/mean": -0.41403061151504517, + "rewards/KL_reward/std": 1.0047106742858887, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.07733979821205139, + "rewards/angle_reward/std": 0.7056263089179993, + "rewards/thinking_verbosity_reward/mean": -1.6566379070281982, + "rewards/thinking_verbosity_reward/std": 0.5692102909088135, + "step": 741 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1909.1328125, + "epoch": 0.8412698412698413, + "grad_norm": 0.01683916337788105, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67129485.0, + "rewards/KL_reward/mean": -0.37234920263290405, + "rewards/KL_reward/std": 1.2858551740646362, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.0883883386850357, + "rewards/angle_reward/std": 0.7043173909187317, + "rewards/thinking_verbosity_reward/mean": -1.7029697895050049, + "rewards/thinking_verbosity_reward/std": 0.5322949290275574, + "step": 742 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1864.59375, + "epoch": 0.8424036281179138, + "grad_norm": 0.2072633057832718, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67400073.0, + "rewards/KL_reward/mean": -0.38009560108184814, + "rewards/KL_reward/std": 1.0967984199523926, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.0883883461356163, + "rewards/angle_reward/std": 0.7043173909187317, + "rewards/thinking_verbosity_reward/mean": -1.6811352968215942, + "rewards/thinking_verbosity_reward/std": 0.5584313869476318, + "step": 743 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1785.5, + "epoch": 0.8435374149659864, + "grad_norm": 0.15221014618873596, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 67661073.0, + "rewards/KL_reward/mean": -0.8090833425521851, + "rewards/KL_reward/std": 2.7380833625793457, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.055242717266082764, + "rewards/angle_reward/std": 0.7077155113220215, + "rewards/thinking_verbosity_reward/mean": -1.6210336685180664, + "rewards/thinking_verbosity_reward/std": 0.6052948832511902, + "step": 744 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1860.8046875, + "epoch": 0.844671201814059, + "grad_norm": 0.2941654324531555, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 67931328.0, + "rewards/KL_reward/mean": -0.4073900580406189, + "rewards/KL_reward/std": 1.372942566871643, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.0883883461356163, + "rewards/angle_reward/std": 0.7043173909187317, + "rewards/thinking_verbosity_reward/mean": -1.6936153173446655, + "rewards/thinking_verbosity_reward/std": 0.5113928318023682, + "step": 745 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1861.3046875, + "epoch": 0.8458049886621315, + "grad_norm": 0.07613295316696167, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68201767.0, + "rewards/KL_reward/mean": -0.5628873705863953, + "rewards/KL_reward/std": 2.050913095474243, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.09943689405918121, + "rewards/angle_reward/std": 0.7028310298919678, + "rewards/thinking_verbosity_reward/mean": -1.6811420917510986, + "rewards/thinking_verbosity_reward/std": 0.618983805179596, + "step": 746 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1708.703125, + "epoch": 0.8469387755102041, + "grad_norm": 0.5091060400009155, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 68452281.0, + "rewards/KL_reward/mean": -0.9779430627822876, + "rewards/KL_reward/std": 3.240374803543091, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.09943688660860062, + "rewards/angle_reward/std": 0.7028310298919678, + "rewards/thinking_verbosity_reward/mean": -1.5964150428771973, + "rewards/thinking_verbosity_reward/std": 0.6059854030609131, + "step": 747 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1769.890625, + "epoch": 0.8480725623582767, + "grad_norm": 0.3866634666919708, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68710355.0, + "rewards/KL_reward/mean": -0.5445457696914673, + "rewards/KL_reward/std": 1.5668498277664185, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.03314562886953354, + "rewards/angle_reward/std": 0.7091048955917358, + "rewards/thinking_verbosity_reward/mean": -1.6147034168243408, + "rewards/thinking_verbosity_reward/std": 0.6311928033828735, + "step": 748 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1770.3125, + "epoch": 0.8492063492063492, + "grad_norm": 0.23417192697525024, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 68969363.0, + "rewards/KL_reward/mean": -0.6993677616119385, + "rewards/KL_reward/std": 2.2965004444122314, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.06629125773906708, + "rewards/angle_reward/std": 0.7067587375640869, + "rewards/thinking_verbosity_reward/mean": -1.6513805389404297, + "rewards/thinking_verbosity_reward/std": 0.6273873448371887, + "step": 749 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1711.46875, + "epoch": 0.8503401360544217, + "grad_norm": 0.4115210175514221, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 69220487.0, + "rewards/KL_reward/mean": -1.3880504369735718, + "rewards/KL_reward/std": 3.688795804977417, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.0883883461356163, + "rewards/angle_reward/std": 0.7043173909187317, + "rewards/thinking_verbosity_reward/mean": -1.5875732898712158, + "rewards/thinking_verbosity_reward/std": 0.7431985139846802, + "step": 750 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1631.078125, + "epoch": 0.8514739229024944, + "grad_norm": 0.6005513668060303, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 69460737.0, + "rewards/KL_reward/mean": -1.3629591464996338, + "rewards/KL_reward/std": 3.653020143508911, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.04419417306780815, + "rewards/angle_reward/std": 0.7084973454475403, + "rewards/thinking_verbosity_reward/mean": -1.5267325639724731, + "rewards/thinking_verbosity_reward/std": 0.7454755902290344, + "step": 751 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1621.4609375, + "epoch": 0.8526077097505669, + "grad_norm": 0.5353714823722839, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 69700668.0, + "rewards/KL_reward/mean": -1.4000484943389893, + "rewards/KL_reward/std": 3.486201524734497, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.0883883461356163, + "rewards/angle_reward/mean": -0.04390813782811165, + "rewards/angle_reward/std": 0.7173286080360413, + "rewards/thinking_verbosity_reward/mean": -1.5657247304916382, + "rewards/thinking_verbosity_reward/std": 0.778367817401886, + "step": 752 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1548.1796875, + "epoch": 0.8537414965986394, + "grad_norm": 1.4211868047714233, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 69930411.0, + "rewards/KL_reward/mean": -1.7806217670440674, + "rewards/KL_reward/std": 4.288891315460205, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.0883883461356163, + "rewards/angle_reward/mean": -0.031445786356925964, + "rewards/angle_reward/std": 0.7157651782035828, + "rewards/thinking_verbosity_reward/mean": -1.5117034912109375, + "rewards/thinking_verbosity_reward/std": 0.8444975018501282, + "step": 753 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1287.015625, + "epoch": 0.854875283446712, + "grad_norm": 0.9926010370254517, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70126653.0, + "rewards/KL_reward/mean": -2.841733455657959, + "rewards/KL_reward/std": 5.6521501541137695, + "rewards/accuracy_reward/mean": 0.0078125, + "rewards/accuracy_reward/std": 0.0883883461356163, + "rewards/angle_reward/mean": 0.020758455619215965, + "rewards/angle_reward/std": 0.7000128626823425, + "rewards/thinking_verbosity_reward/mean": -1.3156667947769165, + "rewards/thinking_verbosity_reward/std": 0.8615114688873291, + "step": 754 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1056.078125, + "epoch": 0.8560090702947846, + "grad_norm": 1.198169469833374, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70293687.0, + "rewards/KL_reward/mean": -4.479970932006836, + "rewards/KL_reward/std": 6.986310958862305, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.05524270981550217, + "rewards/angle_reward/std": 0.7077155113220215, + "rewards/thinking_verbosity_reward/mean": -1.1392841339111328, + "rewards/thinking_verbosity_reward/std": 0.927627444267273, + "step": 755 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 541.9453125, + "epoch": 0.8571428571428571, + "grad_norm": 1.0973591804504395, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70395168.0, + "rewards/KL_reward/mean": -9.952540397644043, + "rewards/KL_reward/std": 8.075974464416504, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.2651650309562683, + "rewards/angle_reward/std": 0.6580811738967896, + "rewards/thinking_verbosity_reward/mean": -0.6300583481788635, + "rewards/thinking_verbosity_reward/std": 0.8170543909072876, + "step": 756 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 462.8046875, + "epoch": 0.8582766439909297, + "grad_norm": 1.203218936920166, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70486183.0, + "rewards/KL_reward/mean": -11.928443908691406, + "rewards/KL_reward/std": 7.501312732696533, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3314563035964966, + "rewards/angle_reward/std": 0.6270634531974792, + "rewards/thinking_verbosity_reward/mean": -0.5203590393066406, + "rewards/thinking_verbosity_reward/std": 0.7906116843223572, + "step": 757 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 296.5703125, + "epoch": 0.8594104308390023, + "grad_norm": 2.216782569885254, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70556584.0, + "rewards/KL_reward/mean": -12.616353988647461, + "rewards/KL_reward/std": 7.149302959442139, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.22097086906433105, + "rewards/angle_reward/std": 0.5733586549758911, + "rewards/thinking_verbosity_reward/mean": -0.3910168409347534, + "rewards/thinking_verbosity_reward/std": 0.6347672939300537, + "step": 758 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 292.3203125, + "epoch": 0.8605442176870748, + "grad_norm": 0.9063274264335632, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70625433.0, + "rewards/KL_reward/mean": -13.474018096923828, + "rewards/KL_reward/std": 7.214576244354248, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.2651650309562683, + "rewards/angle_reward/std": 0.5818785429000854, + "rewards/thinking_verbosity_reward/mean": -0.3680382966995239, + "rewards/thinking_verbosity_reward/std": 0.594377338886261, + "step": 759 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 165.984375, + "epoch": 0.8616780045351474, + "grad_norm": 0.2434951364994049, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70678599.0, + "rewards/KL_reward/mean": -14.889543533325195, + "rewards/KL_reward/std": 5.734178066253662, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3425048291683197, + "rewards/angle_reward/std": 0.5096268057823181, + "rewards/thinking_verbosity_reward/mean": -0.24580667912960052, + "rewards/thinking_verbosity_reward/std": 0.48874032497406006, + "step": 760 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 93.9765625, + "epoch": 0.86281179138322, + "grad_norm": 0.23471392691135406, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70722692.0, + "rewards/KL_reward/mean": -15.928576469421387, + "rewards/KL_reward/std": 4.604101657867432, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.23201939463615417, + "rewards/angle_reward/std": 0.44464775919914246, + "rewards/thinking_verbosity_reward/mean": -0.1656557023525238, + "rewards/thinking_verbosity_reward/std": 0.35872092843055725, + "step": 761 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 74.8515625, + "epoch": 0.8639455782312925, + "grad_norm": 0.3464663624763489, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70764873.0, + "rewards/KL_reward/mean": -15.698003768920898, + "rewards/KL_reward/std": 4.995001316070557, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.2541164755821228, + "rewards/angle_reward/std": 0.46731242537498474, + "rewards/thinking_verbosity_reward/mean": -0.163039430975914, + "rewards/thinking_verbosity_reward/std": 0.31206098198890686, + "step": 762 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 22.109375, + "epoch": 0.8650793650793651, + "grad_norm": 0.16134105622768402, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70799663.0, + "rewards/KL_reward/mean": -16.368640899658203, + "rewards/KL_reward/std": 3.071010112762451, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.15467959642410278, + "rewards/angle_reward/std": 0.36519327759742737, + "rewards/thinking_verbosity_reward/mean": -0.09887511283159256, + "rewards/thinking_verbosity_reward/std": 0.16042843461036682, + "step": 763 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 49.7890625, + "epoch": 0.8662131519274376, + "grad_norm": 0.19804765284061432, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 70837868.0, + "rewards/KL_reward/mean": -16.30763053894043, + "rewards/KL_reward/std": 4.001601219177246, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.2651650309562683, + "rewards/angle_reward/std": 0.42556124925613403, + "rewards/thinking_verbosity_reward/mean": -0.12632089853286743, + "rewards/thinking_verbosity_reward/std": 0.25668248534202576, + "step": 764 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 32.21875, + "epoch": 0.8673469387755102, + "grad_norm": 0.18399427831172943, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70873632.0, + "rewards/KL_reward/mean": -16.83941650390625, + "rewards/KL_reward/std": 2.848832130432129, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.13258251547813416, + "rewards/angle_reward/std": 0.3290405869483948, + "rewards/thinking_verbosity_reward/mean": -0.10092172026634216, + "rewards/thinking_verbosity_reward/std": 0.20551058650016785, + "step": 765 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 27.859375, + "epoch": 0.8684807256235828, + "grad_norm": 0.14216119050979614, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70909534.0, + "rewards/KL_reward/mean": -17.146116256713867, + "rewards/KL_reward/std": 2.3291776180267334, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.09943688660860062, + "rewards/angle_reward/std": 0.29072776436805725, + "rewards/thinking_verbosity_reward/mean": -0.0937347561120987, + "rewards/thinking_verbosity_reward/std": 0.1904277354478836, + "step": 766 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 27.328125, + "epoch": 0.8696145124716553, + "grad_norm": 0.07388874888420105, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70944712.0, + "rewards/KL_reward/mean": -16.985986709594727, + "rewards/KL_reward/std": 2.9843196868896484, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0883883461356163, + "rewards/angle_reward/std": 0.2943028509616852, + "rewards/thinking_verbosity_reward/mean": -0.10031235218048096, + "rewards/thinking_verbosity_reward/std": 0.18606983125209808, + "step": 767 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 11.6171875, + "epoch": 0.8707482993197279, + "grad_norm": 0.023837469518184662, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 70978007.0, + "rewards/KL_reward/mean": -17.148460388183594, + "rewards/KL_reward/std": 2.3115603923797607, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.09943688660860062, + "rewards/angle_reward/std": 0.29072776436805725, + "rewards/thinking_verbosity_reward/mean": -0.08404825627803802, + "rewards/thinking_verbosity_reward/std": 0.10351286828517914, + "step": 768 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 20.359375, + "epoch": 0.8718820861678005, + "grad_norm": 0.01366042997688055, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71012933.0, + "rewards/KL_reward/mean": -17.068466186523438, + "rewards/KL_reward/std": 1.6161787509918213, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0847688764333725, + "rewards/thinking_verbosity_reward/std": 0.15905039012432098, + "step": 769 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 13.5546875, + "epoch": 0.873015873015873, + "grad_norm": 0.15407995879650116, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71046300.0, + "rewards/KL_reward/mean": -16.896625518798828, + "rewards/KL_reward/std": 2.2293224334716797, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.13258251547813416, + "rewards/angle_reward/std": 0.3290405869483948, + "rewards/thinking_verbosity_reward/mean": -0.08467651158571243, + "rewards/thinking_verbosity_reward/std": 0.11787908524274826, + "step": 770 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 5.390625, + "epoch": 0.8741496598639455, + "grad_norm": 7.8302001953125, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71078734.0, + "rewards/KL_reward/mean": -17.01534080505371, + "rewards/KL_reward/std": 2.0076377391815186, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.3486475646495819, + "rewards/thinking_verbosity_reward/mean": -0.07630415260791779, + "rewards/thinking_verbosity_reward/std": 0.03890657052397728, + "step": 771 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.8752834467120182, + "grad_norm": 0.040065620094537735, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71111142.0, + "rewards/KL_reward/mean": -16.999956130981445, + "rewards/KL_reward/std": 0.7916744947433472, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.0707106813788414, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 772 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 13.7265625, + "epoch": 0.8764172335600907, + "grad_norm": 0.942425549030304, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71144795.0, + "rewards/KL_reward/mean": -16.707111358642578, + "rewards/KL_reward/std": 3.559162139892578, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.022097084671258926, + "rewards/angle_reward/std": 0.39621734619140625, + "rewards/thinking_verbosity_reward/mean": -0.08929823338985443, + "rewards/thinking_verbosity_reward/std": 0.11558838188648224, + "step": 773 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.0, + "epoch": 0.8775510204081632, + "grad_norm": 0.011615153402090073, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71176971.0, + "rewards/KL_reward/mean": -17.427825927734375, + "rewards/KL_reward/std": 0.44023942947387695, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": -0.0707106813788414, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 774 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 12.203125, + "epoch": 0.8786848072562359, + "grad_norm": 0.8019652962684631, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71210461.0, + "rewards/KL_reward/mean": -17.166763305664062, + "rewards/KL_reward/std": 1.8365185260772705, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.2509823143482208, + "rewards/thinking_verbosity_reward/mean": -0.07997027784585953, + "rewards/thinking_verbosity_reward/std": 0.11123532056808472, + "step": 775 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 15.296875, + "epoch": 0.8798185941043084, + "grad_norm": 0.6402688026428223, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71244547.0, + "rewards/KL_reward/mean": -17.005334854125977, + "rewards/KL_reward/std": 2.8297479152679443, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.022097086533904076, + "rewards/angle_reward/std": 0.3065877854824066, + "rewards/thinking_verbosity_reward/mean": -0.08827764540910721, + "rewards/thinking_verbosity_reward/std": 0.12712766230106354, + "step": 776 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.1640625, + "epoch": 0.8809523809523809, + "grad_norm": 0.09513384848833084, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71276896.0, + "rewards/KL_reward/mean": -17.158042907714844, + "rewards/KL_reward/std": 1.2754546403884888, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.07172074913978577, + "rewards/thinking_verbosity_reward/std": 0.011427669785916805, + "step": 777 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 5.09375, + "epoch": 0.8820861678004536, + "grad_norm": 2.8279929161071777, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71308932.0, + "rewards/KL_reward/mean": -17.66575050354004, + "rewards/KL_reward/std": 3.2178332805633545, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.011048544198274612, + "rewards/angle_reward/std": 0.39668282866477966, + "rewards/thinking_verbosity_reward/mean": -0.07160855084657669, + "rewards/thinking_verbosity_reward/std": 0.04149220883846283, + "step": 778 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 4.9296875, + "epoch": 0.8832199546485261, + "grad_norm": 5.776390075683594, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71341827.0, + "rewards/KL_reward/mean": -19.910785675048828, + "rewards/KL_reward/std": 6.193860054016113, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": -0.0883883386850357, + "rewards/angle_reward/std": 0.5818785429000854, + "rewards/thinking_verbosity_reward/mean": -0.0617288276553154, + "rewards/thinking_verbosity_reward/std": 0.05266613885760307, + "step": 779 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 50.2578125, + "epoch": 0.8843537414965986, + "grad_norm": 6.9208598136901855, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71380524.0, + "rewards/KL_reward/mean": -24.811588287353516, + "rewards/KL_reward/std": 8.277115821838379, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.14363105595111847, + "rewards/angle_reward/std": 0.6720480918884277, + "rewards/thinking_verbosity_reward/mean": -0.07564210891723633, + "rewards/thinking_verbosity_reward/std": 0.34243935346603394, + "step": 780 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.5859375, + "epoch": 0.8854875283446711, + "grad_norm": 4.5248003005981445, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71412351.0, + "rewards/KL_reward/mean": -30.427478790283203, + "rewards/KL_reward/std": 7.362793922424316, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.3425048589706421, + "rewards/angle_reward/std": 0.5680770874023438, + "rewards/thinking_verbosity_reward/mean": -0.014322892762720585, + "rewards/thinking_verbosity_reward/std": 0.030253706499934196, + "step": 781 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.21875, + "epoch": 0.8866213151927438, + "grad_norm": 1.312471866607666, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71444915.0, + "rewards/KL_reward/mean": -31.381324768066406, + "rewards/KL_reward/std": 4.721668243408203, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.14363107085227966, + "rewards/angle_reward/std": 0.3697133958339691, + "rewards/thinking_verbosity_reward/mean": -0.004610119387507439, + "rewards/thinking_verbosity_reward/std": 0.02076861448585987, + "step": 782 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 25.1015625, + "epoch": 0.8877551020408163, + "grad_norm": 1.2358919382095337, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71479432.0, + "rewards/KL_reward/mean": -32.86699676513672, + "rewards/KL_reward/std": 4.977630615234375, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.19887377321720123, + "rewards/angle_reward/std": 0.38615304231643677, + "rewards/thinking_verbosity_reward/mean": -0.02466108277440071, + "rewards/thinking_verbosity_reward/std": 0.24514424800872803, + "step": 783 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.03125, + "epoch": 0.8888888888888888, + "grad_norm": 0.6902227401733398, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71511756.0, + "rewards/KL_reward/mean": -32.74382019042969, + "rewards/KL_reward/std": 2.933997631072998, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.0011048543965443969, + "rewards/thinking_verbosity_reward/std": 0.00880396831780672, + "step": 784 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0859375, + "epoch": 0.8900226757369615, + "grad_norm": 0.7480749487876892, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71543295.0, + "rewards/KL_reward/mean": -32.922210693359375, + "rewards/KL_reward/std": 3.685519218444824, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.13258251547813416, + "rewards/angle_reward/std": 0.3290405869483948, + "rewards/thinking_verbosity_reward/mean": -0.002333864104002714, + "rewards/thinking_verbosity_reward/std": 0.013102501630783081, + "step": 785 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.078125, + "epoch": 0.891156462585034, + "grad_norm": 1.5596857070922852, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 71575569.0, + "rewards/KL_reward/mean": -32.251678466796875, + "rewards/KL_reward/std": 4.577408790588379, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.12153397500514984, + "rewards/angle_reward/std": 0.33331283926963806, + "rewards/thinking_verbosity_reward/mean": -0.002762136049568653, + "rewards/thinking_verbosity_reward/std": 0.013753578998148441, + "step": 786 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0703125, + "epoch": 0.8922902494331065, + "grad_norm": 1.3572076559066772, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71607450.0, + "rewards/KL_reward/mean": -32.8837776184082, + "rewards/KL_reward/std": 3.814033269882202, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.1657281517982483, + "rewards/angle_reward/std": 0.3602752089500427, + "rewards/thinking_verbosity_reward/mean": -0.0024385314900428057, + "rewards/thinking_verbosity_reward/std": 0.012308008037507534, + "step": 787 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.8934240362811792, + "grad_norm": 0.04590372368693352, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 71639386.0, + "rewards/KL_reward/mean": -33.47450637817383, + "rewards/KL_reward/std": 1.6878197193145752, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 788 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.8945578231292517, + "grad_norm": 1.8047031164169312, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71671795.0, + "rewards/KL_reward/mean": -33.27348327636719, + "rewards/KL_reward/std": 1.5237458944320679, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 789 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.8956916099773242, + "grad_norm": 0.055271029472351074, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 71703491.0, + "rewards/KL_reward/mean": -33.1529655456543, + "rewards/KL_reward/std": 2.925701141357422, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 790 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.8968253968253969, + "grad_norm": 0.25706303119659424, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 71735829.0, + "rewards/KL_reward/mean": -33.68821716308594, + "rewards/KL_reward/std": 2.3951783180236816, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, + "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, + "step": 791 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.8979591836734694, + "grad_norm": 0.011360935866832733, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_tokens": 71768021.0, + "rewards/KL_reward/mean": -33.742801666259766, + "rewards/KL_reward/std": 0.9701665043830872, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 792 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.8990929705215419, + "grad_norm": 0.003016524715349078, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71800237.0, + "rewards/KL_reward/mean": -33.488868713378906, + "rewards/KL_reward/std": 0.9943328499794006, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 793 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0234375, + "epoch": 0.9002267573696145, + "grad_norm": 1.0178414583206177, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 71832376.0, + "rewards/KL_reward/mean": -33.524112701416016, + "rewards/KL_reward/std": 1.811699628829956, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, + "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, + "step": 794 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 23.578125, + "epoch": 0.9013605442176871, + "grad_norm": 0.10701996088027954, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 71867634.0, + "rewards/KL_reward/mean": -33.605445861816406, + "rewards/KL_reward/std": 3.248577356338501, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.021003132686018944, + "rewards/thinking_verbosity_reward/std": 0.23762330412864685, + "step": 795 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9024943310657596, + "grad_norm": 0.019463684409856796, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 71899778.0, + "rewards/KL_reward/mean": -34.059669494628906, + "rewards/KL_reward/std": 0.766512393951416, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 796 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9036281179138322, + "grad_norm": 0.15101948380470276, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 71931714.0, + "rewards/KL_reward/mean": -33.3520622253418, + "rewards/KL_reward/std": 1.259179949760437, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 797 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9047619047619048, + "grad_norm": 0.005296964664012194, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 71964066.0, + "rewards/KL_reward/mean": -33.70747375488281, + "rewards/KL_reward/std": 2.183056354522705, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 798 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9058956916099773, + "grad_norm": 0.025313332676887512, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 71995730.0, + "rewards/KL_reward/mean": -34.05757141113281, + "rewards/KL_reward/std": 1.0717991590499878, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 799 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.9070294784580499, + "grad_norm": 1.1392728090286255, + "learning_rate": 5e-05, + "loss": 0.0007, + "num_tokens": 72028132.0, + "rewards/KL_reward/mean": -33.22903823852539, + "rewards/KL_reward/std": 2.581110954284668, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, + "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, + "step": 800 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9081632653061225, + "grad_norm": 0.0020190200302749872, + "learning_rate": 5e-05, + "loss": 0.0014, + "num_tokens": 72060028.0, + "rewards/KL_reward/mean": -33.52140808105469, + "rewards/KL_reward/std": 1.0689212083816528, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 801 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.909297052154195, + "grad_norm": 0.10467184334993362, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 72092500.0, + "rewards/KL_reward/mean": -33.260520935058594, + "rewards/KL_reward/std": 1.2949765920639038, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 802 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9104308390022676, + "grad_norm": 0.0027886980678886175, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 72124692.0, + "rewards/KL_reward/mean": -33.974403381347656, + "rewards/KL_reward/std": 1.119321346282959, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 803 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9115646258503401, + "grad_norm": 0.010387484915554523, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 72157340.0, + "rewards/KL_reward/mean": -32.96565246582031, + "rewards/KL_reward/std": 1.5892037153244019, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 804 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9126984126984127, + "grad_norm": 0.0026018789503723383, + "learning_rate": 5e-05, + "loss": -0.0, + "num_tokens": 72189516.0, + "rewards/KL_reward/mean": -33.66239547729492, + "rewards/KL_reward/std": 0.877173900604248, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 805 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0234375, + "epoch": 0.9138321995464853, + "grad_norm": 0.0786629393696785, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 72221863.0, + "rewards/KL_reward/mean": -33.49411392211914, + "rewards/KL_reward/std": 2.0544230937957764, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, + "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, + "step": 806 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9149659863945578, + "grad_norm": 0.0015710083534941077, + "learning_rate": 5e-05, + "loss": -0.0009, + "num_tokens": 72254455.0, + "rewards/KL_reward/mean": -33.7327880859375, + "rewards/KL_reward/std": 0.9656606912612915, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 807 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9160997732426304, + "grad_norm": 0.009940247051417828, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 72286591.0, + "rewards/KL_reward/mean": -33.457847595214844, + "rewards/KL_reward/std": 1.3197795152664185, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 808 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9172335600907029, + "grad_norm": 0.002746545011177659, + "learning_rate": 5e-05, + "loss": 0.0012, + "num_tokens": 72318567.0, + "rewards/KL_reward/mean": -33.70878219604492, + "rewards/KL_reward/std": 1.5371614694595337, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 809 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9183673469387755, + "grad_norm": 0.003681495087221265, + "learning_rate": 5e-05, + "loss": 0.0011, + "num_tokens": 72350511.0, + "rewards/KL_reward/mean": -33.79541778564453, + "rewards/KL_reward/std": 1.3166762590408325, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 810 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9195011337868481, + "grad_norm": 0.0012977722799405456, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 72382647.0, + "rewards/KL_reward/mean": -33.16996765136719, + "rewards/KL_reward/std": 1.4044597148895264, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 811 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9206349206349206, + "grad_norm": 0.0006923092296347022, + "learning_rate": 5e-05, + "loss": 0.0, + "num_tokens": 72414679.0, + "rewards/KL_reward/mean": -33.57663345336914, + "rewards/KL_reward/std": 0.7723235487937927, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 812 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9217687074829932, + "grad_norm": 0.002347388304769993, + "learning_rate": 5e-05, + "loss": 0.0013, + "num_tokens": 72446583.0, + "rewards/KL_reward/mean": -33.96721267700195, + "rewards/KL_reward/std": 1.482001781463623, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 813 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9229024943310657, + "grad_norm": 0.005274900700896978, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 72478815.0, + "rewards/KL_reward/mean": -33.25440216064453, + "rewards/KL_reward/std": 1.6389936208724976, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 814 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0234375, + "epoch": 0.9240362811791383, + "grad_norm": 0.7483558058738708, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 72511170.0, + "rewards/KL_reward/mean": -33.375099182128906, + "rewards/KL_reward/std": 2.3296220302581787, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, + "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, + "step": 815 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9251700680272109, + "grad_norm": 0.007809279952198267, + "learning_rate": 5e-05, + "loss": -0.0024, + "num_tokens": 72542842.0, + "rewards/KL_reward/mean": -33.83318328857422, + "rewards/KL_reward/std": 1.2061774730682373, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 816 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9263038548752834, + "grad_norm": 0.0041608852334320545, + "learning_rate": 5e-05, + "loss": -0.0011, + "num_tokens": 72574522.0, + "rewards/KL_reward/mean": -34.07609558105469, + "rewards/KL_reward/std": 0.9378816485404968, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 817 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0234375, + "epoch": 0.927437641723356, + "grad_norm": 0.6836727857589722, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 72606389.0, + "rewards/KL_reward/mean": -33.83430099487305, + "rewards/KL_reward/std": 1.9606149196624756, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0005524271982721984, + "rewards/thinking_verbosity_reward/std": 0.0062500000931322575, + "step": 818 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9285714285714286, + "grad_norm": 0.0031801860313862562, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 72638325.0, + "rewards/KL_reward/mean": -32.74711990356445, + "rewards/KL_reward/std": 1.9061007499694824, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 819 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9297052154195011, + "grad_norm": 0.0025071382988244295, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 72670381.0, + "rewards/KL_reward/mean": -33.63817596435547, + "rewards/KL_reward/std": 1.056267261505127, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 820 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9308390022675737, + "grad_norm": 0.010071910917758942, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 72702093.0, + "rewards/KL_reward/mean": -33.934120178222656, + "rewards/KL_reward/std": 1.2471729516983032, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 821 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9319727891156463, + "grad_norm": 0.0020680369343608618, + "learning_rate": 5e-05, + "loss": -0.0003, + "num_tokens": 72734269.0, + "rewards/KL_reward/mean": -33.41252136230469, + "rewards/KL_reward/std": 1.0085152387619019, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 822 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.9331065759637188, + "grad_norm": 0.061341334134340286, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 72766319.0, + "rewards/KL_reward/mean": -33.721580505371094, + "rewards/KL_reward/std": 1.2540863752365112, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 823 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9342403628117913, + "grad_norm": 0.002179608680307865, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 72798607.0, + "rewards/KL_reward/mean": -33.82682800292969, + "rewards/KL_reward/std": 1.0525864362716675, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 824 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.935374149659864, + "grad_norm": 0.0029823859222233295, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 72830295.0, + "rewards/KL_reward/mean": -33.834556579589844, + "rewards/KL_reward/std": 1.2774964570999146, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 825 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9365079365079365, + "grad_norm": 0.0008695355500094593, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 72862535.0, + "rewards/KL_reward/mean": -33.31136703491211, + "rewards/KL_reward/std": 0.9780426025390625, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 826 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.937641723356009, + "grad_norm": 0.03273223340511322, + "learning_rate": 5e-05, + "loss": -0.001, + "num_tokens": 72894839.0, + "rewards/KL_reward/mean": -33.34189224243164, + "rewards/KL_reward/std": 1.4788143634796143, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 827 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.9387755102040817, + "grad_norm": 0.5189052820205688, + "learning_rate": 5e-05, + "loss": -0.0015, + "num_tokens": 72926696.0, + "rewards/KL_reward/mean": -33.972129821777344, + "rewards/KL_reward/std": 2.086747169494629, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 828 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9399092970521542, + "grad_norm": 0.0020410194993019104, + "learning_rate": 5e-05, + "loss": -0.0007, + "num_tokens": 72958368.0, + "rewards/KL_reward/mean": -34.186222076416016, + "rewards/KL_reward/std": 1.0480031967163086, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 829 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9410430839002267, + "grad_norm": 0.061405811458826065, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 72991000.0, + "rewards/KL_reward/mean": -33.245933532714844, + "rewards/KL_reward/std": 0.804006040096283, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 830 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9421768707482994, + "grad_norm": 0.0010703267762437463, + "learning_rate": 5e-05, + "loss": -0.0005, + "num_tokens": 73022864.0, + "rewards/KL_reward/mean": -33.59266662597656, + "rewards/KL_reward/std": 1.5360360145568848, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 831 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9433106575963719, + "grad_norm": 0.0038445971440523863, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 73054808.0, + "rewards/KL_reward/mean": -33.263980865478516, + "rewards/KL_reward/std": 1.106505274772644, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 832 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9444444444444444, + "grad_norm": 0.007286259904503822, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 73087216.0, + "rewards/KL_reward/mean": -33.16855239868164, + "rewards/KL_reward/std": 1.4726207256317139, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 833 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9455782312925171, + "grad_norm": 0.0012580020120367408, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 73119296.0, + "rewards/KL_reward/mean": -34.0222053527832, + "rewards/KL_reward/std": 1.0648142099380493, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 834 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0078125, + "epoch": 0.9467120181405896, + "grad_norm": 0.592292070388794, + "learning_rate": 5e-05, + "loss": -0.001, + "num_tokens": 73151793.0, + "rewards/KL_reward/mean": -33.82099151611328, + "rewards/KL_reward/std": 2.220374584197998, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 835 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9478458049886621, + "grad_norm": 0.0006550090620294213, + "learning_rate": 5e-05, + "loss": -0.0005, + "num_tokens": 73182945.0, + "rewards/KL_reward/mean": -33.92092514038086, + "rewards/KL_reward/std": 0.9080770611763, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 836 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9489795918367347, + "grad_norm": 0.1275930404663086, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 73214609.0, + "rewards/KL_reward/mean": -33.773582458496094, + "rewards/KL_reward/std": 1.3788979053497314, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 837 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9501133786848073, + "grad_norm": 0.2163899689912796, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 73246073.0, + "rewards/KL_reward/mean": -33.065582275390625, + "rewards/KL_reward/std": 1.198525071144104, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 838 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9512471655328798, + "grad_norm": 0.4263833463191986, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 73278121.0, + "rewards/KL_reward/mean": -33.5583381652832, + "rewards/KL_reward/std": 1.0508912801742554, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 839 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9523809523809523, + "grad_norm": 0.007769063580781221, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 73309825.0, + "rewards/KL_reward/mean": -33.93596649169922, + "rewards/KL_reward/std": 1.1275330781936646, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 840 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.953514739229025, + "grad_norm": 0.002262361813336611, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_tokens": 73341953.0, + "rewards/KL_reward/mean": -33.48320007324219, + "rewards/KL_reward/std": 0.9198437333106995, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 841 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9546485260770975, + "grad_norm": 0.0017249841475859284, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 73373857.0, + "rewards/KL_reward/mean": -33.71734619140625, + "rewards/KL_reward/std": 1.1636419296264648, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 842 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.95578231292517, + "grad_norm": 0.005231876391917467, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 73405865.0, + "rewards/KL_reward/mean": -32.96625518798828, + "rewards/KL_reward/std": 1.9772436618804932, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 843 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9569160997732427, + "grad_norm": 0.22509323060512543, + "learning_rate": 5e-05, + "loss": -0.0011, + "num_tokens": 73437481.0, + "rewards/KL_reward/mean": -33.50747299194336, + "rewards/KL_reward/std": 1.3297713994979858, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 844 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.03125, + "epoch": 0.9580498866213152, + "grad_norm": 0.43399322032928467, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 73469845.0, + "rewards/KL_reward/mean": -33.005149841308594, + "rewards/KL_reward/std": 2.149930477142334, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.06629125773906708, + "rewards/angle_reward/std": 0.24199791252613068, + "rewards/thinking_verbosity_reward/mean": -0.0007812500116415322, + "rewards/thinking_verbosity_reward/std": 0.006225345656275749, + "step": 845 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9591836734693877, + "grad_norm": 0.0004079754580743611, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 73502325.0, + "rewards/KL_reward/mean": -34.061588287353516, + "rewards/KL_reward/std": 0.9618063569068909, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 846 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9603174603174603, + "grad_norm": 0.000760400842409581, + "learning_rate": 5e-05, + "loss": 0.0017, + "num_tokens": 73534797.0, + "rewards/KL_reward/mean": -33.50699996948242, + "rewards/KL_reward/std": 0.8282644152641296, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 847 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9614512471655329, + "grad_norm": 0.020066983997821808, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 73567221.0, + "rewards/KL_reward/mean": -33.0126953125, + "rewards/KL_reward/std": 1.1289455890655518, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 848 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9625850340136054, + "grad_norm": 0.004405636806041002, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 73599277.0, + "rewards/KL_reward/mean": -33.39598083496094, + "rewards/KL_reward/std": 1.8640003204345703, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 849 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.963718820861678, + "grad_norm": 0.0047976113855838776, + "learning_rate": 5e-05, + "loss": 0.0015, + "num_tokens": 73630781.0, + "rewards/KL_reward/mean": -33.9675178527832, + "rewards/KL_reward/std": 1.5000041723251343, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 850 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9648526077097506, + "grad_norm": 0.005626079626381397, + "learning_rate": 5e-05, + "loss": 0.0003, + "num_tokens": 73662093.0, + "rewards/KL_reward/mean": -34.15525436401367, + "rewards/KL_reward/std": 1.5996448993682861, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 851 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9659863945578231, + "grad_norm": 0.007012875750660896, + "learning_rate": 5e-05, + "loss": 0.0009, + "num_tokens": 73694517.0, + "rewards/KL_reward/mean": -33.43494415283203, + "rewards/KL_reward/std": 1.1145339012145996, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 852 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9671201814058957, + "grad_norm": 0.003037257120013237, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 73726869.0, + "rewards/KL_reward/mean": -33.14628982543945, + "rewards/KL_reward/std": 1.6235324144363403, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 853 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9682539682539683, + "grad_norm": 0.002727856859564781, + "learning_rate": 5e-05, + "loss": -0.0004, + "num_tokens": 73759517.0, + "rewards/KL_reward/mean": -33.35026550292969, + "rewards/KL_reward/std": 1.4061540365219116, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 854 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9693877551020408, + "grad_norm": 0.14295855164527893, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 73791589.0, + "rewards/KL_reward/mean": -33.52638244628906, + "rewards/KL_reward/std": 1.6596015691757202, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 855 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9705215419501134, + "grad_norm": 0.02205885760486126, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 73823589.0, + "rewards/KL_reward/mean": -33.47103500366211, + "rewards/KL_reward/std": 1.1936787366867065, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 856 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.971655328798186, + "grad_norm": 0.004215400665998459, + "learning_rate": 5e-05, + "loss": -0.0015, + "num_tokens": 73855749.0, + "rewards/KL_reward/mean": -33.84236145019531, + "rewards/KL_reward/std": 1.2295082807540894, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 857 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9727891156462585, + "grad_norm": 0.002084847306832671, + "learning_rate": 5e-05, + "loss": -0.0002, + "num_tokens": 73887389.0, + "rewards/KL_reward/mean": -33.31908416748047, + "rewards/KL_reward/std": 0.9242004156112671, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 858 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9739229024943311, + "grad_norm": 0.010090108960866928, + "learning_rate": 5e-05, + "loss": -0.0014, + "num_tokens": 73919005.0, + "rewards/KL_reward/mean": -33.2784423828125, + "rewards/KL_reward/std": 2.2050771713256836, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 859 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9750566893424036, + "grad_norm": 0.014052431099116802, + "learning_rate": 5e-05, + "loss": 0.0027, + "num_tokens": 73951565.0, + "rewards/KL_reward/mean": -33.448123931884766, + "rewards/KL_reward/std": 2.091165781021118, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 860 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9761904761904762, + "grad_norm": 0.0007639123359695077, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 73983653.0, + "rewards/KL_reward/mean": -34.18672561645508, + "rewards/KL_reward/std": 0.844143807888031, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 861 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9773242630385488, + "grad_norm": 0.0013192676706239581, + "learning_rate": 5e-05, + "loss": 0.0043, + "num_tokens": 74016085.0, + "rewards/KL_reward/mean": -33.311458587646484, + "rewards/KL_reward/std": 1.3442423343658447, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 862 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9784580498866213, + "grad_norm": 0.010709281079471111, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 74048101.0, + "rewards/KL_reward/mean": -33.879364013671875, + "rewards/KL_reward/std": 1.5034685134887695, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 863 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.9795918367346939, + "grad_norm": 0.4506100118160248, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 74079879.0, + "rewards/KL_reward/mean": -34.168434143066406, + "rewards/KL_reward/std": 1.8324439525604248, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 864 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9807256235827665, + "grad_norm": 0.00034951631096191704, + "learning_rate": 5e-05, + "loss": -0.002, + "num_tokens": 74111447.0, + "rewards/KL_reward/mean": -33.98359680175781, + "rewards/KL_reward/std": 1.1653918027877808, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 865 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.981859410430839, + "grad_norm": 0.00833884160965681, + "learning_rate": 5e-05, + "loss": 0.0019, + "num_tokens": 74143159.0, + "rewards/KL_reward/mean": -33.90995407104492, + "rewards/KL_reward/std": 1.0673109292984009, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 866 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9829931972789115, + "grad_norm": 0.015380697324872017, + "learning_rate": 5e-05, + "loss": 0.0006, + "num_tokens": 74175335.0, + "rewards/KL_reward/mean": -33.579200744628906, + "rewards/KL_reward/std": 1.330080509185791, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 867 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9841269841269841, + "grad_norm": 0.0010960256913676858, + "learning_rate": 5e-05, + "loss": -0.0001, + "num_tokens": 74207391.0, + "rewards/KL_reward/mean": -33.26451873779297, + "rewards/KL_reward/std": 0.9738898873329163, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 868 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9852607709750567, + "grad_norm": 0.0006172276334837079, + "learning_rate": 5e-05, + "loss": -0.0015, + "num_tokens": 74239719.0, + "rewards/KL_reward/mean": -33.499027252197266, + "rewards/KL_reward/std": 1.2270981073379517, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 869 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9863945578231292, + "grad_norm": 0.0011030619498342276, + "learning_rate": 5e-05, + "loss": -0.001, + "num_tokens": 74271535.0, + "rewards/KL_reward/mean": -33.678924560546875, + "rewards/KL_reward/std": 1.0721451044082642, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 870 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9875283446712018, + "grad_norm": 0.0012125244829803705, + "learning_rate": 5e-05, + "loss": -0.0008, + "num_tokens": 74303423.0, + "rewards/KL_reward/mean": -33.96786117553711, + "rewards/KL_reward/std": 1.3098315000534058, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 871 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9886621315192744, + "grad_norm": 0.0038846044335514307, + "learning_rate": 5e-05, + "loss": -0.0018, + "num_tokens": 74336031.0, + "rewards/KL_reward/mean": -33.63890838623047, + "rewards/KL_reward/std": 1.1418358087539673, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 872 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9897959183673469, + "grad_norm": 0.0008887458825483918, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 74367703.0, + "rewards/KL_reward/mean": -33.608436584472656, + "rewards/KL_reward/std": 0.9306058883666992, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 873 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.015625, + "epoch": 0.9909297052154195, + "grad_norm": 1.6916550397872925, + "learning_rate": 5e-05, + "loss": -0.0017, + "num_tokens": 74399681.0, + "rewards/KL_reward/mean": -33.1297721862793, + "rewards/KL_reward/std": 2.368335247039795, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.03314562886953354, + "rewards/angle_reward/std": 0.17432378232479095, + "rewards/thinking_verbosity_reward/mean": -0.0003906250058207661, + "rewards/thinking_verbosity_reward/std": 0.0044194171205163, + "step": 874 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9920634920634921, + "grad_norm": 0.0032412756700068712, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 74432561.0, + "rewards/KL_reward/mean": -33.358238220214844, + "rewards/KL_reward/std": 1.5701112747192383, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 875 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9931972789115646, + "grad_norm": 0.14960019290447235, + "learning_rate": 5e-05, + "loss": -0.0006, + "num_tokens": 74465073.0, + "rewards/KL_reward/mean": -33.12785339355469, + "rewards/KL_reward/std": 1.2311831712722778, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 876 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9943310657596371, + "grad_norm": 0.0003175087331328541, + "learning_rate": 5e-05, + "loss": 0.0004, + "num_tokens": 74496385.0, + "rewards/KL_reward/mean": -33.74147415161133, + "rewards/KL_reward/std": 1.2235573530197144, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 877 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9954648526077098, + "grad_norm": 0.004698865581303835, + "learning_rate": 5e-05, + "loss": 0.0001, + "num_tokens": 74528537.0, + "rewards/KL_reward/mean": -34.1864013671875, + "rewards/KL_reward/std": 1.1026948690414429, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 878 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9965986394557823, + "grad_norm": 0.0013571645831689239, + "learning_rate": 5e-05, + "loss": 0.0002, + "num_tokens": 74560633.0, + "rewards/KL_reward/mean": -33.780052185058594, + "rewards/KL_reward/std": 1.0348597764968872, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 879 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9977324263038548, + "grad_norm": 0.0004332130483817309, + "learning_rate": 5e-05, + "loss": 0.0024, + "num_tokens": 74591257.0, + "rewards/KL_reward/mean": -34.17106628417969, + "rewards/KL_reward/std": 1.0338495969772339, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 880 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 0.9988662131519275, + "grad_norm": 0.019798073917627335, + "learning_rate": 5e-05, + "loss": 0.0008, + "num_tokens": 74623361.0, + "rewards/KL_reward/mean": -33.206809997558594, + "rewards/KL_reward/std": 1.5338208675384521, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 881 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/mean_length": 1.0, + "epoch": 1.0, + "grad_norm": 0.009082326665520668, + "learning_rate": 5e-05, + "loss": 0.0005, + "num_tokens": 74655785.0, + "rewards/KL_reward/mean": -33.23743438720703, + "rewards/KL_reward/std": 1.4101316928863525, + "rewards/accuracy_reward/mean": 0.0, + "rewards/accuracy_reward/std": 0.0, + "rewards/angle_reward/mean": 0.0, + "rewards/angle_reward/std": 0.0, + "rewards/thinking_verbosity_reward/mean": 0.0, + "rewards/thinking_verbosity_reward/std": 0.0, + "step": 882 + } + ], + "logging_steps": 1, + "max_steps": 882, + "num_input_tokens_seen": 74655785, + "num_train_epochs": 1, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}